diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -73,6 +73,10 @@
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
+
+  /// Create a CFI index for CFIInst and insert a CFI_INSTRUCTION using it.
+  void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -17,7 +17,9 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCDwarf.h"
 
 using namespace llvm;
 
@@ -332,6 +334,7 @@
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
+  const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo();
 
   assert(MFI->isEntryFunction());
 
@@ -379,6 +382,22 @@
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
+  // On entry, the SP/FP are not yet set up, so we must define the CFA in
+  // terms of a literal location expression.
+  static const char CFAEncodedInst[] = {
+      dwarf::DW_CFA_def_cfa_expression,
+      3, // length of the expression in bytes
+      static_cast<char>(dwarf::DW_OP_lit0),
+      static_cast<char>(dwarf::DW_OP_lit6), // DW_ASPACE_AMDGPU_private_wave (FIXME: should be defined elsewhere)
+      static_cast<char>(dwarf::DW_OP_LLVM_form_aspace_address)};
+  BuildCFI(MBB, I, DL,
+           MCCFIInstruction::createEscape(
+               nullptr, StringRef(CFAEncodedInst, sizeof(CFAEncodedInst))));
+  // Unwinding halts when the return address (PC) is undefined.
+  BuildCFI(MBB, I, DL,
+           MCCFIInstruction::createUndefined(
+               nullptr, MCRI->getDwarfRegNum(AMDGPU::PC_REG, false)));
+
   if (MF.getFrameInfo().hasCalls()) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
@@ -1049,3 +1068,16 @@
     MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
     MF.getTarget().Options.DisableFramePointerElim(MF);
 }
+
+void SIFrameLowering::BuildCFI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               const DebugLoc &DL,
+                               const MCCFIInstruction &CFIInst) const {
+  MachineFunction &MF = *MBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned CFIIndex = MF.addFrameInst(CFIInst);
+  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex)
+      .setMIFlag(MachineInstr::FrameSetup);
+}
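
Note on the escape above: it is byte-for-byte the DWARF program
DW_CFA_def_cfa_expression, length 3, DW_OP_lit0, DW_OP_lit6,
DW_OP_LLVM_form_aspace_address; DW_OP_lit0 pushes the literal address 0,
DW_OP_lit6 pushes address space 6 (the private wave space), and the final
operation forms the segmented address, so the CFA of an entry function is
address 0 in address space 6. Marking the PC undefined then tells consumers
to stop unwinding there. The test changes below add nounwind, presumably so
the autogenerated check lines are not perturbed by CFI emitted for functions
that may unwind. As a minimal usage sketch of the new BuildCFI helper
(illustrative only, not part of this patch): a non-entry prologue that has
just materialized the stack pointer could re-point the CFA at it with the
generic def_cfa_register helper. The free function and its name here are
hypothetical:

// Sketch: assumes SPReg was already assigned by the surrounding prologue
// code and that this runs where MMI's register info is available.
static void emitDefCfaAtSP(const SIFrameLowering &TFI, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           const DebugLoc &DL, Register SPReg) {
  MachineFunction &MF = *MBB.getParent();
  const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo();
  // Translate the target register to its DWARF number (non-EH mapping).
  unsigned DwarfSP = MCRI->getDwarfRegNum(SPReg, /*isEH=*/false);
  // Reuse the wrapper from this patch to emit the CFI_INSTRUCTION.
  TFI.BuildCFI(MBB, MBBI, DL,
               MCCFIInstruction::createDefCfaRegister(nullptr, DwarfSP));
}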
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
 
-define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
+define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) #0 {
 ; GFX9-LABEL: v_add_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20,7 +20,7 @@
   ret <2 x i16> %add
 }
 
-define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
+define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) #0 {
 ; GFX9-LABEL: v_add_v2i16_fneg_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41,7 +41,7 @@
   ret <2 x i16> %add
 }
 
-define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
+define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) #0 {
 ; GFX9-LABEL: v_add_v2i16_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -62,7 +62,7 @@
   ret <2 x i16> %add
 }
 
-define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
+define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87,7 +87,7 @@
   ret <2 x i16> %add
 }
 
-define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
+define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) #0 {
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -109,7 +109,7 @@
   ret <2 x i16> %add
 }
 
-define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
+define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) #0 {
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -129,7 +129,7 @@
   ret <2 x i16> %add
 }
 
-define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
+define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) #0 {
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -149,7 +149,7 @@
   ret <2 x i16> %add
 }
 
-define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
+define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) #0 {
 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s1, 0xffffffc0
@@ -179,7 +179,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
+define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) #0 {
 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, 0xffffffc0, 4
@@ -207,7 +207,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
+define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) #0 {
 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, 4, 0xffffffc0
@@ -235,7 +235,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) #0 {
 ; GFX9-LABEL: s_add_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
@@ -265,7 +265,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
+define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) #0 {
 ; GFX9-LABEL: s_add_v2i16_fneg_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
@@ -299,7 +299,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
+define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) #0 {
 ; GFX9-LABEL: s_add_v2i16_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
@@ -333,7 +333,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
+define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) #0 {
 ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s2, 0x80008000
@@ -372,3 +372,5 @@
   %cast = bitcast <2 x i16> %add to i32
   ret i32 %cast
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -3,7 +3,7 @@
 
 ; End to end tests for scalar vs. vector boolean legalization strategies.
 
-define amdgpu_ps float @select_vgpr_sgpr_trunc_cond(i32 inreg %a, i32 %b, i32 %c) {
+define amdgpu_ps float @select_vgpr_sgpr_trunc_cond(i32 inreg %a, i32 %b, i32 %c) #0 {
 ; GCN-LABEL: select_vgpr_sgpr_trunc_cond:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, 1, s0
@@ -16,7 +16,7 @@
   ret float %r.f
 }
 
-define amdgpu_ps float @select_vgpr_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, i32 %b, i32 %c) {
+define amdgpu_ps float @select_vgpr_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, i32 %b, i32 %c) #0 {
 ; GCN-LABEL: select_vgpr_sgpr_trunc_and_cond:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, s0, s1
@@ -32,7 +32,7 @@
   ret float %r.f
 }
 
-define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, i32 inreg %b, i32 inreg %c) {
+define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, i32 inreg %b, i32 inreg %c) #0 {
 ; GCN-LABEL: select_sgpr_trunc_and_cond:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, s0, s1
@@ -47,7 +47,7 @@
   ret i32 %r
 }
 
-define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
+define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) #0 {
 ; GCN-LABEL: sgpr_trunc_brcond:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -74,7 +74,7 @@
   unreachable
 }
 
-define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
+define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) #0 {
 ; GCN-LABEL: brcond_sgpr_trunc_and:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -103,3 +103,4 @@
   store volatile i32 1, i32 addrspace(1)* undef
   unreachable
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
 
-define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
+define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) #0 {
 ; GFX7-LABEL: s_bswap_i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_alignbit_b32 v0, s0, s0, 8
@@ -32,7 +32,7 @@
   ret i32 %bswap
 }
 
-define i32 @v_bswap_i32(i32 %src) {
+define i32 @v_bswap_i32(i32 %src) #0 {
 ; GFX7-LABEL: v_bswap_i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59,7 +59,7 @@
   ret i32 %bswap
 }
 
-define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
+define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) #0 {
 ; GFX7-LABEL: s_bswap_v2i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_alignbit_b32 v0, s0, s0, 8
@@ -98,7 +98,7 @@
   ret <2 x i32> %bswap
 }
 
-define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
+define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) #0 {
 ; GFX7-LABEL: v_bswap_v2i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -130,7 +130,7 @@
   ret <2 x i32> %bswap
 }
 
-define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
+define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) #0 {
 ; GFX7-LABEL: s_bswap_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_alignbit_b32 v0, s1, s1, 8
@@ -169,7 +169,7 @@
   ret i64 %bswap
 }
 
-define i64 @v_bswap_i64(i64 %src) {
+define i64 @v_bswap_i64(i64 %src) #0 {
 ; GFX7-LABEL: v_bswap_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -204,7 +204,7 @@
   ret i64 %bswap
 }
 
-define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
+define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) #0 {
 ; GFX7-LABEL: s_bswap_v2i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_alignbit_b32 v0, s1, s1, 8
@@ -263,7 +263,7 @@
   ret <2 x i64> %bswap
 }
 
-define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
+define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) #0 {
 ; GFX7-LABEL: v_bswap_v2i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -311,7 +311,7 @@
   ret <2 x i64> %bswap
 }
 
-define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
+define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) #0 {
 ; GFX7-LABEL: s_bswap_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_lshl_b32 s1, s0, 8
@@ -339,7 +339,7 @@
   ret i16 %bswap
 }
 
-define i16 @v_bswap_i16(i16 %src) {
+define i16 @v_bswap_i16(i16 %src) #0 {
 ; GFX7-LABEL: v_bswap_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -366,7 +366,7 @@
   ret i16 %bswap
 }
 
-define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
+define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) #0 {
 ; GFX7-LABEL: s_bswap_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s3, 0xffff
@@ -404,7 +404,7 @@
   ret i32 %cast
 }
 
-define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
+define i32 @v_bswap_i16_zext_to_i32(i16 %src) #0 {
 ; GFX7-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -433,7 +433,7 @@
   ret i32 %zext
 }
 
-define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
+define i32 @v_bswap_i16_sext_to_i32(i16 %src) #0 {
 ; GFX7-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,7 +464,7 @@
   ret i32 %zext
 }
 
-define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
+define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) #0 {
 ; GFX7-LABEL: v_bswap_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -497,12 +497,12 @@
 }
 
 ; FIXME
-; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
+; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) #0 {
 ;   %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %ext.src)
 ;   ret <3 x i16> %bswap
 ; }
 
-define i64 @v_bswap_i48(i64 %src) {
+define i64 @v_bswap_i48(i64 %src) #0 {
 ; GFX7-LABEL: v_bswap_i48:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -3,7 +3,7 @@
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
-define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
+define i32 @divergent_if_swap_brtarget_order0(i32 %value) #0 {
 ; CHECK-LABEL: divergent_if_swap_brtarget_order0:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30,7 +30,7 @@
   ret i32 %v
 }
 
-define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
+define i32 @divergent_if_swap_brtarget_order1(i32 %value) #0 {
 ; CHECK-LABEL: divergent_if_swap_brtarget_order1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -58,7 +58,7 @@
 }
 
 ; Make sure and 1 is inserted on llvm.amdgcn.if
-define i32 @divergent_if_nonboolean_condition0(i32 %value) {
+define i32 @divergent_if_nonboolean_condition0(i32 %value) #0 {
 ; CHECK-LABEL: divergent_if_nonboolean_condition0:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87,7 +87,7 @@
 }
 
 ; Make sure and 1 is inserted on llvm.amdgcn.if
-define i32 @divergent_if_nonboolean_condition1(i32 addrspace(1)* %ptr) {
+define i32 @divergent_if_nonboolean_condition1(i32 addrspace(1)* %ptr) #0 {
 ; CHECK-LABEL: divergent_if_nonboolean_condition1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -123,7 +123,7 @@
 
 ; Make sure this case compiles. G_ICMP was mis-mapped due to having
 ; the result register class constrained by llvm.amdgcn.if lowering.
-define void @constrained_if_register_class() {
+define void @constrained_if_register_class() #0 {
 ; CHECK-LABEL: constrained_if_register_class:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -199,7 +199,7 @@
   ret void
 }
 
-define amdgpu_kernel void @break_loop(i32 %arg) {
+define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 ; CHECK-LABEL: break_loop:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
@@ -249,3 +249,4 @@
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GPRIDX %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=MOVREL %s
 
-define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
+define float @dyn_extract_v8f32_const_s_v(i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_const_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59,7 +59,7 @@
   ret float %ext
 }
 
-define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) {
+define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_const_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s4, 1.0
@@ -94,7 +94,7 @@
   ret float %ext
 }
 
-define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel) {
+define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -149,7 +149,7 @@
   ret float %ext
 }
 
-define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
+define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -189,7 +189,7 @@
   ret float %ext
 }
 
-define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) {
+define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
@@ -207,7 +207,7 @@
   ret float %ext
 }
 
-define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -242,7 +242,7 @@
   ret float %ext
 }
 
-define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
+define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8i64_const_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -301,7 +301,7 @@
   ret i64 %ext
 }
 
-define amdgpu_ps void @dyn_extract_v8i64_const_s_s(i32 inreg %sel) {
+define amdgpu_ps void @dyn_extract_v8i64_const_s_s(i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8i64_const_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[4:5], 1
@@ -341,7 +341,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
+define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8i64_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -417,7 +417,7 @@
   ret void
 }
 
-define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
+define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8i64_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,7 +464,7 @@
   ret i64 %ext
 }
 
-define amdgpu_ps void @dyn_extract_v8i64_v_s(<8 x i64> %vec, i32 inreg %sel) {
+define amdgpu_ps void @dyn_extract_v8i64_v_s(<8 x i64> %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8i64_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_lshl_b32 s0, s2, 1
@@ -490,7 +490,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_extract_v8i64_s_s(<8 x i64> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps void @dyn_extract_v8i64_s_s(<8 x i64> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8i64_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -546,7 +546,7 @@
   ret void
 }
 
-define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_s_s_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -582,7 +582,7 @@
   ret float %ext
 }
 
-define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
+define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -623,7 +623,7 @@
   ret float %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset1:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -673,7 +673,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset2:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -723,7 +723,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -773,7 +773,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset4:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -823,7 +823,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset5:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -873,7 +873,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset6:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -923,7 +923,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset7:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -974,7 +974,7 @@
   ret double %ext
 }
 
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offsetm1(<8 x double> inreg %vec, i32 inreg %sel) {
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offsetm1(<8 x double> inreg %vec, i32 inreg %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offsetm1:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1024,7 +1024,7 @@
   ret double %ext
 }
 
-define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
+define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1074,7 +1074,7 @@
   ret double %ext
 }
 
-define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %idx) {
+define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8p3_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1114,7 +1114,7 @@
   ret i8 addrspace(3)* %ext
 }
 
-define amdgpu_ps void @dyn_extract_v8p3_s_s(<8 x i8 addrspace(3)*> inreg %vec, i32 inreg %idx) {
+define amdgpu_ps void @dyn_extract_v8p3_s_s(<8 x i8 addrspace(3)*> inreg %vec, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8p3_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1153,7 +1153,7 @@
   ret void
 }
 
-define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %idx) {
+define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8p1_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1200,7 +1200,7 @@
   ret i8 addrspace(1)* %ext
 }
 
-define amdgpu_ps void @dyn_extract_v8p1_s_s(<8 x i8 addrspace(1)*> inreg %vec, i32 inreg %idx) {
+define amdgpu_ps void @dyn_extract_v8p1_s_s(<8 x i8 addrspace(1)*> inreg %vec, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_extract_v8p1_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1255,3 +1255,4 @@
   store i8 addrspace(1)* %ext, i8 addrspace(1)* addrspace(1)* undef
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefix=GFX78 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX78 %s
 
-define double @v_floor_f64_ieee(double %x) {
+define double @v_floor_f64_ieee(double %x) #0 {
 ; GFX6-LABEL: v_floor_f64_ieee:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,7 +26,7 @@
   ret double %result
 }
 
-define double @v_floor_f64_ieee_nnan(double %x) {
+define double @v_floor_f64_ieee_nnan(double %x) #0 {
 ; GFX6-LABEL: v_floor_f64_ieee_nnan:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46,7 +46,7 @@
   ret double %result
 }
 
-define double @v_floor_f64_ieee_fneg(double %x) {
+define double @v_floor_f64_ieee_fneg(double %x) #0 {
 ; GFX6-LABEL: v_floor_f64_ieee_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -137,7 +137,7 @@
   ret double %result
 }
 
-define double @v_floor_f64_fabs(double %x) {
+define double @v_floor_f64_fabs(double %x) #0 {
 ; GFX6-LABEL: v_floor_f64_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -166,7 +166,7 @@
   ret double %result
 }
 
-define double @v_floor_f64_fneg_fabs(double %x) {
+define double @v_floor_f64_fneg_fabs(double %x) #0 {
 ; GFX6-LABEL: v_floor_f64_fneg_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -191,7 +191,7 @@
   ret double %result
 }
 
-define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
+define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) #0 {
 ; GFX6-LABEL: s_floor_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e32 v[0:1], s[2:3]
@@ -215,7 +215,7 @@
   ret <2 x float> %cast
 }
 
-define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
+define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) #0 {
 ; GFX6-LABEL: s_floor_f64_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -s[2:3]
@@ -240,7 +240,7 @@
   ret <2 x float> %cast
 }
 
-define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
+define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) #0 {
 ; GFX6-LABEL: s_floor_f64_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], |s[2:3]|
@@ -265,7 +265,7 @@
   ret <2 x float> %cast
 }
 
-define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
+define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) #0 {
 ; GFX6-LABEL: s_floor_f64_fneg_fabs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -|s[2:3]|
@@ -295,4 +295,4 @@
 declare double @llvm.fabs.f64(double) #0
 
 attributes #0 = { nounwind readnone speculatable willreturn }
-attributes #1 = { "amdgpu-ieee"="false" }
+attributes #1 = { nounwind "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 
-define float @v_fma_f32(float %x, float %y, float %z) {
+define float @v_fma_f32(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25,7 +25,7 @@
   ret float %fma
 }
 
-define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
+define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
 ; GFX6-LABEL: v_fma_v2f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -50,7 +50,7 @@
   ret <2 x float> %fma
 }
 
-define half @v_fma_f16(half %x, half %y, half %z) {
+define half @v_fma_f16(half %x, half %y, half %z) #0 {
 ; GFX6-LABEL: v_fma_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76,7 +76,7 @@
   ret half %fma
 }
 
-define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
+define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX6-LABEL: v_fma_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -114,7 +114,7 @@
   ret <2 x half> %fma
 }
 
-define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
+define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -159,7 +159,7 @@
   ret <2 x half> %fma
 }
 
-define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
+define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX6-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -204,7 +204,7 @@
   ret <2 x half> %fma
 }
 
-define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
+define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -260,12 +260,12 @@
 }
 
 ; FIXME:
-; define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
+; define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
 ;   %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z)
 ;   ret <3 x half> %fma
 ; }
 
-define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
+define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
 ; GFX6-LABEL: v_fma_v4f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -321,7 +321,7 @@
   ret <4 x half> %fma
 }
 
-define double @v_fma_f64(double %x, double %y, double %z) {
+define double @v_fma_f64(double %x, double %y, double %z) #0 {
 ; GFX6-LABEL: v_fma_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -343,7 +343,7 @@
   ret double %fma
 }
 
-define double @v_fma_f64_fneg_all(double %x, double %y, double %z) {
+define double @v_fma_f64_fneg_all(double %x, double %y, double %z) #0 {
 ; GFX6-LABEL: v_fma_f64_fneg_all:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -368,7 +368,7 @@
   ret double %fma
 }
 
-define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
+define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
 ; GFX6-LABEL: v_fma_v2f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -393,7 +393,7 @@
   ret <2 x double> %fma
 }
 
-define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
+define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_fabs_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -416,7 +416,7 @@
   ret float %fma
 }
 
-define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
+define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_fabs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -439,7 +439,7 @@
   ret float %fma
 }
 
-define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) {
+define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_fabs_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -463,7 +463,7 @@
   ret float %fma
 }
 
-define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float %z) {
+define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_sgpr_vgpr_vgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fma_f32 v0, s0, v0, v1
@@ -482,7 +482,7 @@
   ret float %fma
 }
 
-define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float %z) {
+define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_vgpr_sgpr_vgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_fma_f32 v0, v0, s0, v1
@@ -501,7 +501,7 @@
   ret float %fma
 }
 
-define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y, float inreg %z) {
+define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y, float inreg %z) #0 {
 ; GFX6-LABEL: v_fma_f32_sgpr_sgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s1
@@ -526,7 +526,7 @@
   ret float %fma
 }
 
-define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
+define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,7 +549,7 @@
   ret float %fma
 }
 
-define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
+define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -572,7 +572,7 @@
   ret float %fma
 }
 
-define float @v_fma_f32_fneg_z(float %x, float %y, float %z) {
+define float @v_fma_f32_fneg_z(float %x, float %y, float %z) #0 {
 ; GFX6-LABEL: v_fma_f32_fneg_z:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 
-define float @v_pow_f32(float %x, float %y) {
+define float @v_pow_f32(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31,7 +31,7 @@
   ret float %pow
 }
 
-define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
+define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -68,7 +68,7 @@
   ret <2 x float> %pow
 }
 
-define half @v_pow_f16(half %x, half %y) {
+define half @v_pow_f16(half %x, half %y) #0 {
 ; GFX6-LABEL: v_pow_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,7 +105,7 @@
   ret half %pow
 }
 
-define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -165,7 +165,7 @@
   ret <2 x half> %pow
 }
 
-define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -233,7 +233,7 @@
   ret <2 x half> %pow
 }
 
-define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -301,7 +301,7 @@
   ret <2 x half> %pow
 }
 
-define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -382,12 +382,12 @@
 }
 
 ; FIXME
-; define double @v_pow_f64(double %x, double %y) {
+; define double @v_pow_f64(double %x, double %y) #0 {
 ;   %pow = call double @llvm.pow.f64(double %x, double %y)
 ;   ret double %pow
 ; }
 
-define float @v_pow_f32_fabs_lhs(float %x, float %y) {
+define float @v_pow_f32_fabs_lhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -416,7 +416,7 @@
   ret float %pow
 }
 
-define float @v_pow_f32_fabs_rhs(float %x, float %y) {
+define float @v_pow_f32_fabs_rhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fabs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -445,7 +445,7 @@
   ret float %pow
 }
 
-define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
+define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +475,7 @@
   ret float %pow
 }
 
-define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
+define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_log_f32_e32 v1, s0
@@ -500,7 +500,7 @@
   ret float %pow
 }
 
-define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
+define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) #0 {
 ; GFX6-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
@@ -525,7 +525,7 @@
   ret float %pow
 }
 
-define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
+define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) #0 {
 ; GFX6-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_log_f32_e32 v0, s0
@@ -550,7 +550,7 @@
   ret float %pow
 }
 
-define float @v_pow_f32_fneg_lhs(float %x, float %y) {
+define float @v_pow_f32_fneg_lhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -579,7 +579,7 @@
   ret float %pow
 }
 
-define float @v_pow_f32_fneg_rhs(float %x, float %y) {
+define float @v_pow_f32_fneg_rhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -617,3 +617,4 @@
 
 declare <2 x half> @llvm.pow.v2f16(<2 x half>, <2 x half>)
 declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6,7 +6,7 @@
 ; FIXME: Need constant bus fixup pre-gfx10 for movrel
 ; ERR: Bad machine code: VOP* instruction violates constant bus restriction
 
-define amdgpu_ps <8 x i32> @dyn_insertelement_v8i32_s_s_s(<8 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x i32> @dyn_insertelement_v8i32_s_s_s(<8 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8i32_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -41,7 +41,7 @@
   ret <8 x i32> %insert
 }
 
-define amdgpu_ps <8 x i8 addrspace(3)*> @dyn_insertelement_v8p3i8_s_s_s(<8 x i8 addrspace(3)*> inreg %vec, i8 addrspace(3)* inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x i8 addrspace(3)*> @dyn_insertelement_v8p3i8_s_s_s(<8 x i8 addrspace(3)*> inreg %vec, i8 addrspace(3)* inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8p3i8_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -76,7 +76,7 @@
   ret <8 x i8 addrspace(3)*> %insert
 }
 
-define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) {
+define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_const_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -180,7 +180,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %vec, float inreg %val, i32 %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %vec, float inreg %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
@@ -274,7 +274,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %vec, float %val, i32 inreg %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %vec, float %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -327,7 +327,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_s(<8 x float> %vec, float inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_s(<8 x float> %vec, float inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
@@ -346,7 +346,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %vec, float %val, i32 %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %vec, float %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
@@ -441,7 +441,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_v(<8 x float> %vec, float inreg %val, i32 %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_v(<8 x float> %vec, float inreg %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -510,7 +510,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_s(<8 x float> %vec, float %val, i32 inreg %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_s(<8 x float> %vec, float %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
@@ -529,7 +529,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8p3i8_v_v_s(<8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 inreg %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8p3i8_v_v_s(<8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8p3i8_v_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
@@ -550,7 +550,7 @@
   ret <8 x float> %cast.1
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v(<8 x float> %vec, float %val, i32 %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v(<8 x float> %vec, float %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -619,7 +619,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x i64> @dyn_insertelement_v8i64_s_s_s(<8 x i64> inreg %vec, i64 inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x i64> @dyn_insertelement_v8i64_s_s_s(<8 x i64> inreg %vec, i64 inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8i64_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -670,7 +670,7 @@
   ret <8 x i64> %insert
 }
 
-define amdgpu_ps <8 x i8 addrspace(1)*> @dyn_insertelement_v8p1i8_s_s_s(<8 x i8 addrspace(1)*> inreg %vec, i8 addrspace(1)* inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x i8 addrspace(1)*> @dyn_insertelement_v8p1i8_s_s_s(<8 x i8 addrspace(1)*> inreg %vec, i8 addrspace(1)* inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8p1i8_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -721,7 +721,7 @@
   ret <8 x i8 addrspace(1)*> %insert
 }
 
-define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
+define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_const_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -887,7 +887,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
@@ -1041,7 +1041,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
@@ -1145,7 +1145,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double inreg %val, i32 inreg %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_lshl_b32 s0, s4, 1
@@ -1185,7 +1185,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
@@ -1339,7 +1339,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double inreg %val, i32 %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double inreg %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -1429,7 +1429,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %val, i32 inreg %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_lshl_b32 s0, s2, 1
@@ -1469,7 +1469,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %val, i32 %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -1559,7 +1559,7 @@
   ret void
 }
 
-define amdgpu_ps <3 x i32> @dyn_insertelement_v3i32_s_s_s(<3 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+define amdgpu_ps <3 x i32> @dyn_insertelement_v3i32_s_s_s(<3 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v3i32_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1584,7 +1584,7 @@
   ret <3 x i32> %insert
 }
 
-define amdgpu_ps <3 x float> @dyn_insertelement_v3i32_v_v_s(<3 x float> %vec, float %val, i32 inreg %idx) {
+define amdgpu_ps <3 x float> @dyn_insertelement_v3i32_v_v_s(<3 x float> %vec, float %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v3i32_v_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
@@ -1603,7 +1603,7 @@
   ret <3 x float> %insert
 }
 
-define amdgpu_ps <5 x i32> @dyn_insertelement_v5i32_s_s_s(<5 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+define amdgpu_ps <5 x i32> @dyn_insertelement_v5i32_s_s_s(<5 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v5i32_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1632,7 +1632,7 @@
   ret <5 x i32> %insert
 }
 
-define amdgpu_ps <5 x float> @dyn_insertelement_v5i32_v_v_s(<5 x float> %vec, float %val, i32 inreg %idx) {
+define amdgpu_ps <5 x float> @dyn_insertelement_v5i32_v_v_s(<5 x float> %vec, float %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v5i32_v_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
@@ -1651,7 +1651,7 @@
   ret <5 x float> %insert
 }
 
-define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v32i32_s_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1734,7 +1734,7 @@
   ret <32 x i32> %insert
 }
 
-define amdgpu_ps <32 x float> @dyn_insertelement_v32i32_v_v_s(<32 x float> %vec, float %val, i32 inreg %idx) {
+define amdgpu_ps <32 x float> @dyn_insertelement_v32i32_v_v_s(<32 x float> %vec, float %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v32i32_v_v_s:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
@@ -1753,7 +1753,7 @@
   ret <32 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_1:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1805,7 +1805,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_7:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -1857,7 +1857,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_1:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -1927,7 +1927,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) {
+define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_7:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -1997,7 +1997,7 @@
   ret <8 x float> %insert
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %vec, double inreg %val, i32 inreg %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %vec, double inreg %val, i32 inreg %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_s_add_1:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
@@ -2100,7 +2100,7 @@
   ret void
 }
 
-define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) {
+define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) #0 {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
@@ -2192,3 +2192,4 @@
   store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
+define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) #0 {
 ; GCN-LABEL: test_wave32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s1, s[4:5], 0x0
@@ -37,3 +37,4 @@
 }
 
 declare void @llvm.amdgcn.end.cf.i32(i32 %val)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
+define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) #0 {
 ; GCN-LABEL: test_wave64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[4:5], 0x0
@@ -35,3 +35,4 @@
 }
 
 declare void @llvm.amdgcn.end.cf.i64(i64 %val)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define float @v_mul_legacy_f32(float %a, float %b) {
+define float @v_mul_legacy_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_mul_legacy_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -12,7 +12,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_undef0_f32(float %a) {
+define float @v_mul_legacy_undef0_f32(float %a) #0 {
 ; GCN-LABEL: v_mul_legacy_undef0_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22,7 +22,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_undef1_f32(float %a) {
+define float @v_mul_legacy_undef1_f32(float %a) #0 {
 ; GCN-LABEL: v_mul_legacy_undef1_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -32,7 +32,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_undef_f32() {
+define float @v_mul_legacy_undef_f32() #0 {
 ; GCN-LABEL: v_mul_legacy_undef_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42,7 +42,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_fabs_f32(float %a, float %b) {
+define float @v_mul_legacy_fabs_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_mul_legacy_fabs_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -54,7 +54,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_fneg_f32(float %a, float %b) {
+define float @v_mul_legacy_fneg_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_mul_legacy_fneg_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -67,7 +67,7 @@
 }
 
 ; TODO: Should match mac_legacy/mad_legacy
-define float @v_mad_legacy_f32(float %a, float %b, float %c) {
+define float @v_mad_legacy_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_mad_legacy_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,7 +79,7 @@
   ret float %add
 }
 
-define amdgpu_ps float @s_mul_legacy_f32(float inreg %a, float inreg %b) {
+define amdgpu_ps float @s_mul_legacy_f32(float inreg %a, float inreg %b) #0 {
 ; GCN-LABEL: s_mul_legacy_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
@@ -89,7 +89,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_f32_1.0(float %a) {
+define float @v_mul_legacy_f32_1.0(float %a) #0 {
 ; GCN-LABEL: v_mul_legacy_f32_1.0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_f32_1.0_swap(float %b) {
+define float @v_mul_legacy_f32_1.0_swap(float %b) #0 {
 ; GCN-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -109,7 +109,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_f32_2.0(float %a) {
+define float @v_mul_legacy_f32_2.0(float %a) #0 {
 ; GCN-LABEL: v_mul_legacy_f32_2.0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -119,7 +119,7 @@
   ret float %result
 }
 
-define float @v_mul_legacy_f32_2.0_swap(float %b) {
+define float @v_mul_legacy_f32_2.0_swap(float %b) #0 {
 ; GCN-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
+define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) #0 {
 ; GCN-LABEL: test_wave32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
@@ -24,3 +24,4 @@
 }
 
 declare i32 @llvm.amdgcn.if.break.i32(i1, i32)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
+define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) #0 {
 ; GCN-LABEL: test_wave64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[4:5], 0x0
@@ -24,3 +24,4 @@
 }
 
 declare i64 @llvm.amdgcn.if.break.i64(i1, i64)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -4,7 +4,7 @@
 
 ; TODO: Merge with DAG test
 
-define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
+define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) #0 {
 ; CI-LABEL: is_private_vgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -50,7 +50,7 @@
   ret void
 }
 
-define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
+define amdgpu_kernel void @is_private_sgpr(i8* %ptr) #0 {
 ; CI-LABEL: is_private_sgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -4,7 +4,7 @@
 
 ; TODO: Merge with DAG test
 
-define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
+define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) #0 {
 ; CI-LABEL: is_local_vgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -50,7 +50,7 @@
   ret void
 }
 
-define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
+define amdgpu_kernel void @is_local_sgpr(i8* %ptr) #0 {
 ; CI-LABEL: is_local_sgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -4,7 +4,7 @@
 
 ; FIXME: Merge with DAG test
 
-define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) #0 {
 ; GFX8-LABEL: dpp_test:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -34,7 +34,7 @@
   store i32 %tmp0, i32 addrspace(1)* %out
   ret void
 }
-define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
+define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) #0 {
 ; GFX8-LABEL: mov_dpp64_test:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 
-define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28,7 +28,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_clamp:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +52,7 @@
   ret i32 %r
 }
 
-define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
+define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) #0 {
 ; GFX906-LABEL: v_sdot2_sgpr_sgpr_sgpr:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s1
@@ -78,7 +78,7 @@
   ret float %cast
 }
 
-define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
+define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,7 +105,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
+define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
+define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,7 +164,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_inline_literal_a_b_c() {
+define i32 @v_sdot2_inline_literal_a_b_c() #0 {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,7 +196,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
+define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) #0 {
 ; GFX906-LABEL: v_sdot2_inline_literal_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -220,7 +220,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_fneg_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -246,7 +246,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
+define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_fneg_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -272,7 +272,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
+define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) #0 {
 ; GFX906-LABEL: v_sdot2_fnegf32_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -301,7 +301,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
+define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) #0 {
 ; GFX906-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -330,7 +330,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -358,7 +358,7 @@
   ret i32 %r
 }
 
-define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_sdot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 
-define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28,7 +28,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_clamp:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +52,7 @@
   ret i32 %r
 }
 
-define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
+define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) #0 {
 ; GFX906-LABEL: v_udot2_sgpr_sgpr_sgpr:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    v_mov_b32_e32 v0, s1
@@ -78,7 +78,7 @@
   ret float %cast
 }
 
-define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
+define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,7 +105,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
+define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -132,7 +132,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
+define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,7 +164,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_inline_literal_a_b_c() {
+define i32 @v_udot2_inline_literal_a_b_c() #0 {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,7 +196,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
+define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) #0 {
 ; GFX906-LABEL: v_udot2_inline_literal_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -220,7 +220,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_fneg_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -246,7 +246,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
+define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_fneg_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -272,7 +272,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
+define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) #0 {
 ; GFX906-LABEL: v_udot2_fnegf32_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -301,7 +301,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
+define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) #0 {
 ; GFX906-LABEL: v_udot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -330,7 +330,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -358,7 +358,7 @@
   ret i32 %r
 }
 
-define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
+define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
 ; GFX906-LABEL: v_udot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
-define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
+define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) #0 {
 ; GFX8-LABEL: dpp_test:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -34,7 +34,7 @@
   store i32 %tmp0, i32 addrspace(1)* %out
   ret void
 }
-define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
+define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) #0 {
 ; GFX8-LABEL: update_dpp64_test:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -4,7 +4,7 @@
 ; Test the localizer did something and we don't materialize all
 ; constants in SGPRs in the entry block.
 
-define amdgpu_kernel void @localize_constants(i1 %cond) {
+define amdgpu_kernel void @localize_constants(i1 %cond) #0 {
 ; GFX9-LABEL: localize_constants:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
@@ -72,7 +72,7 @@
 @gv2 = addrspace(1) global i32 undef, align 4
 @gv3 = addrspace(1) global i32 undef, align 4
 
-define amdgpu_kernel void @localize_globals(i1 %cond) {
+define amdgpu_kernel void @localize_globals(i1 %cond) #0 {
 ; GFX9-LABEL: localize_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
@@ -131,7 +131,7 @@
 @static.gv2 = internal addrspace(1) global i32 undef, align 4
 @static.gv3 = internal addrspace(1) global i32 undef, align 4
 
-define void @localize_internal_globals(i1 %cond) {
+define void @localize_internal_globals(i1 %cond) #0 {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,3 +196,4 @@
 bb2:
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -5,7 +5,7 @@
 ; Test end to end matching of addressing modes when MUBUF is used for
 ; global memory.
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -29,7 +29,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -56,7 +56,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0
@@ -89,7 +89,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 4
@@ -122,7 +122,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4096(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4096(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4096:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -149,7 +149,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0
@@ -174,7 +174,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 0
@@ -199,7 +199,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(i32 addrspace(1)* %ptr) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(i32 addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 4
@@ -224,7 +224,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(i32 addrspace(1)* %ptr) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(i32 addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4096:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0
@@ -249,7 +249,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg %ptr, i32 inreg %soffset) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_sgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -282,7 +282,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
@@ -307,7 +307,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(1)* %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
@@ -333,7 +333,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
+define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_movk_i32 s4, 0x400
@@ -371,7 +371,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -400,7 +400,7 @@
   ret void
 }
 
-define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset_offset4095(i32 addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset_offset4095(i32 addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_vgpr_offset_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -431,7 +431,7 @@
   store i32 0, i32 addrspace(1)* %gep1
   ret void
 }
-define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -461,7 +461,7 @@
   ret void
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -485,7 +485,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -512,7 +512,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0
@@ -545,7 +545,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 4
@@ -578,7 +578,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4096:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -605,7 +605,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0
@@ -630,7 +630,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)* %ptr) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 0
@@ -655,7 +655,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)* %ptr) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 4
@@ -680,7 +680,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4096:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0
@@ -705,7 +705,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inreg %ptr, i32 inreg %soffset) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inreg %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_sgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -738,7 +738,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
@@ -763,7 +763,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspace(1)* %ptr, i32 inreg %soffset) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspace(1)* %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
@@ -789,7 +789,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
+define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) #0 {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_movk_i32 s4, 0x400
@@ -827,7 +827,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -856,7 +856,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_vgpr_offset_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -887,7 +887,7 @@
   %val = load float, float addrspace(1)* %gep1
   ret float %val
 }
-define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -917,7 +917,7 @@
   ret float %val
 }
 
-define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -952,7 +952,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr) #0 {
 ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0
@@ -993,7 +993,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) {
+define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 2
@@ -1028,7 +1028,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) {
+define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) #0 {
 ; GFX6-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 0
@@ -1063,7 +1063,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) {
+define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) #0 {
 ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -1102,7 +1102,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr, i32 %old, i32 %in) {
+define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr, i32 %old, i32 %in) #0 {
 ; GFX6-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, s2
@@ -1140,7 +1140,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr, i32 %old, i32 %in) {
+define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr, i32 %old, i32 %in) #0 {
 ; GFX6-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0
@@ -1184,7 +1184,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr, i32 %old, i32 %in) {
+define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr, i32 %old, i32 %in) #0 {
 ; GFX6-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4095:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
@@ -1220,7 +1220,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr, i32 %old, i32 %in) {
+define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr, i32 %old, i32 %in) #0 {
 ; GFX6-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s0, 0
@@ -1256,7 +1256,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset, i32 %old, i32 %in) {
+define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset, i32 %old, i32 %in) #0 {
 ; GFX6-LABEL: mubuf_cmpxchg_sgpr_ptr_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
@@ -1295,3 +1295,4 @@
   %cast = bitcast i32 %result to float
   ret float %cast
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
-define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
+define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) #0 {
 ; GFX7-LABEL: s_mul_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mul_i32 s0, s0, s1
@@ -28,7 +28,7 @@
   ret i16 %result
 }
 
-define i16 @v_mul_i16(i16 %num, i16 %den) {
+define i16 @v_mul_i16(i16 %num, i16 %den) #0 {
 ; GFX7-LABEL: v_mul_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -53,7 +53,7 @@
   ret i16 %result
 }
 
-define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
+define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) #0 {
 ; GFX7-LABEL: s_mul_i16_zeroext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mul_i32 s0, s0, s1
@@ -81,7 +81,7 @@
   ret i16 %result
 }
 
-define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
+define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) #0 {
 ; GFX7-LABEL: v_mul_i16_zeroext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -107,7 +107,7 @@
   ret i16 %result
 }
 
-define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
+define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) #0 {
 ; GFX7-LABEL: s_mul_i16_signext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mul_i32 s0, s0, s1
@@ -135,7 +135,7 @@
   ret i16 %result
 }
 
-define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
+define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) #0 {
 ; GFX7-LABEL: v_mul_i16_signext:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -163,7 +163,7 @@
   ret i16 %result
 }
 
-define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
+define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) #0 {
 ; GCN-LABEL: s_mul_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mul_i32 s0, s0, s1
@@ -172,7 +172,7 @@
   ret i32 %result
 }
 
-define i32 @v_mul_i32(i32 %num, i32 %den) {
+define i32 @v_mul_i32(i32 %num, i32 %den) #0 {
 ; GCN-LABEL: v_mul_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -182,7 +182,7 @@
   ret i32 %result
 }
 
-define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
+define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) #0 {
 ; GCN-LABEL: s_mul_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mul_i32 s0, s0, s2
@@ -192,7 +192,7 @@
   ret <2 x i32> %result
 }
 
-define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GCN-LABEL: v_mul_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -203,7 +203,7 @@
   ret <2 x i32> %result
 }
 
-define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
+define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) #0 {
 ; GFX7-LABEL: s_mul_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
@@ -244,7 +244,7 @@
   ret i64 %result
 }
 
-define i64 @v_mul_i64(i64 %num, i64 %den) {
+define i64 @v_mul_i64(i64 %num, i64 %den) #0 {
 ; GFX7-LABEL: v_mul_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -282,7 +282,7 @@
   ret i64 %result
 }
 
-define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
+define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) #0 {
 ; GFX7-LABEL: s_mul_i96:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s3
@@ -374,7 +374,7 @@
   ret <3 x i32> %cast
 }
 
-define i96 @v_mul_i96(i96 %num, i96 %den) {
+define i96 @v_mul_i96(i96 %num, i96 %den) #0 {
 ; GFX7-LABEL: v_mul_i96:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -454,7 +454,7 @@
   ret i96 %result
 }
 
-define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
+define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) #0 {
 ; GFX7-LABEL: s_mul_i128:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
@@ -631,7 +631,7 @@
   ret <4 x i32> %cast
 }
 
-define i128 @v_mul_i128(i128 %num, i128 %den) {
+define i128 @v_mul_i128(i128 %num, i128 %den) #0 {
 ; GFX7-LABEL: v_mul_i128:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -780,7 +780,7 @@
   ret i128 %result
 }
 
-define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
+define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) #0 {
 ; GFX7-LABEL: s_mul_i256:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s8
@@ -1577,7 +1577,7 @@
   ret <8 x i32> %cast
 }
 
-define i256 @v_mul_i256(i256 %num, i256 %den) {
+define i256 @v_mul_i256(i256 %num, i256 %den) #0 {
 ; GFX7-LABEL: v_mul_i256:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2231,3 +2231,4 @@
   %result = mul i256 %num, %den
   ret i256 %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -4,7 +4,7 @@
 
 ; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
 
-define i32 @v_sdiv_i32(i32 %num, i32 %den) {
+define i32 @v_sdiv_i32(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_sdiv_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -95,7 +95,7 @@
 ; FIXME: This is a workaround for not handling uniform VGPR case.
 declare i32 @llvm.amdgcn.readfirstlane(i32)
 
-define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) {
+define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) #0 {
 ; GISEL-LABEL: s_sdiv_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_ashr_i32 s2, s0, 31
@@ -187,7 +187,7 @@
   ret i32 %readlane
 }
 
-define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_sdiv_v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -349,7 +349,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
+define i32 @v_sdiv_i32_pow2k_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_sdiv_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -389,7 +389,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
+define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -462,7 +462,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
+define i32 @v_sdiv_i32_oddk_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_sdiv_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -502,7 +502,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
+define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_sdiv_v2i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -575,7 +575,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_sdiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
+define i32 @v_sdiv_i32_pow2_shl_denom(i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: v_sdiv_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -617,7 +617,7 @@
   ret i32 %r
 }
 
-define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
+define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) #0 {
 ; GISEL-LABEL: v_sdiv_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -786,7 +786,7 @@
   ret <2 x i32> %r
 }
 
-define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
+define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_sdiv_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -873,7 +873,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_sdiv_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1028,3 +1028,4 @@
   %result = sdiv <2 x i32> %num.mask, %den.mask
   ret <2 x i32> %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
@@ -4,7 +4,7 @@
 
 ; Test gfx9+ s_shl[1-4]_add_u32 pattern matching
 
-define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GFX9-LABEL: s_shl1_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl1_add_u32 s0, s0, s1
@@ -20,7 +20,7 @@
   ret i32 %add
 }
 
-define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GFX9-LABEL: s_shl2_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl2_add_u32 s0, s0, s1
@@ -36,7 +36,7 @@
   ret i32 %add
 }
 
-define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GFX9-LABEL: s_shl3_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl3_add_u32 s0, s0, s1
@@ -52,7 +52,7 @@
   ret i32 %add
 }
 
-define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GFX9-LABEL: s_shl4_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl4_add_u32 s0, s0, s1
@@ -68,7 +68,7 @@
   ret i32 %add
 }
 
-define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GCN-LABEL: s_shl5_add_u32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 5
@@ -79,7 +79,7 @@
   ret i32 %add
 }
 
-define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) {
+define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) #0 {
 ; GFX9-LABEL: v_shl1_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -97,7 +97,7 @@
   ret i32 %add
 }
 
-define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) {
+define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) #0 {
 ; GFX9-LABEL: v_shl2_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -115,7 +115,7 @@
   ret i32 %add
 }
 
-define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) {
+define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) #0 {
 ; GFX9-LABEL: v_shl3_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -133,7 +133,7 @@
   ret i32 %add
 }
 
-define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) {
+define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) #0 {
 ; GFX9-LABEL: v_shl4_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -151,7 +151,7 @@
   ret i32 %add
 }
 
-define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) {
+define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) #0 {
 ; GFX9-LABEL: v_shl5_add_u32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -171,7 +171,7 @@
 
 ; FIXME: Use v_lshl_add_u32
 ; shift is scalar, but add is vector.
-define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) #0 {
 ; GFX9-LABEL: shl1_add_u32_vgpr1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
@@ -189,7 +189,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) #0 {
 ; GFX9-LABEL: shl2_add_u32_vgpr1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
@@ -207,7 +207,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) #0 {
 ; GFX9-LABEL: shl3_add_u32_vgpr1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
@@ -225,7 +225,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) #0 {
 ; GFX9-LABEL: shl4_add_u32_vgpr1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
@@ -243,7 +243,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) #0 {
 ; GFX9-LABEL: shl5_add_u32_vgpr1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 5
@@ -261,7 +261,7 @@
   ret float %cast
 }
 
-define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) #0 {
 ; GFX9-LABEL: s_shl1_add_u32_v2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl1_add_u32 s0, s0, s2
@@ -280,7 +280,7 @@
   ret <2 x i32> %add
 }
 
-define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) #0 {
 ; GFX9-LABEL: s_shl2_add_u32_v2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl2_add_u32 s0, s0, s2
@@ -299,7 +299,7 @@
   ret <2 x i32> %add
 }
 
-define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) #0 {
 ; GFX9-LABEL: s_shl3_add_u32_v2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl3_add_u32 s0, s0, s2
@@ -318,7 +318,7 @@
   ret <2 x i32> %add
 }
 
-define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) #0 {
 ; GFX9-LABEL: s_shl4_add_u32_v2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl4_add_u32 s0, s0, s2
@@ -337,7 +337,7 @@
   ret <2 x i32> %add
 }
 
-define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) #0 {
 ; GFX9-LABEL: s_shl_2_4_add_u32_v2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshl2_add_u32 s0, s0, s2
@@ -356,7 +356,7 @@
   ret <2 x i32> %add
 }
 
-define amdgpu_ps { i32, i32 } @s_shl4_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps { i32, i32 } @s_shl4_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GCN-LABEL: s_shl4_add_u32_multi_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 4
@@ -369,7 +369,7 @@
   ret { i32, i32 } %insert1
 }
 
-define amdgpu_ps { i32, i32 } @s_shl3_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps { i32, i32 } @s_shl3_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GCN-LABEL: s_shl3_add_u32_multi_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 3
@@ -382,7 +382,7 @@
   ret { i32, i32 } %insert1
 }
 
-define amdgpu_ps { i32, i32 } @s_shl2_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps { i32, i32 } @s_shl2_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GCN-LABEL: s_shl2_add_u32_multi_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 2
@@ -396,7 +396,7 @@
 }
 
 
-define amdgpu_ps { i32, i32 } @s_shl1_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) {
+define amdgpu_ps { i32, i32 } @s_shl1_add_u32_multi_use(i32 inreg %src0, i32 inreg %src1) #0 {
 ; GCN-LABEL: s_shl1_add_u32_multi_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 1
@@ -408,3 +408,4 @@
   %insert1 = insertvalue { i32, i32 } %insert0, i32 %add, 1
   ret { i32, i32 } %insert1
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -4,7 +4,7 @@
 
 ; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
 
-define i32 @v_srem_i32(i32 %num, i32 %den) {
+define i32 @v_srem_i32(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_srem_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,7 +93,7 @@
 ; FIXME: This is a workaround for not handling uniform VGPR case.
 declare i32 @llvm.amdgcn.readfirstlane(i32)
 
-define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) {
+define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) #0 {
 ; GISEL-LABEL: s_srem_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_ashr_i32 s4, s0, 31
@@ -183,7 +183,7 @@
   ret i32 %readlane
 }
 
-define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_srem_v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -341,7 +341,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_srem_i32_pow2k_denom(i32 %num) {
+define i32 @v_srem_i32_pow2k_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_srem_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -380,7 +380,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
+define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_srem_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -451,7 +451,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_srem_i32_oddk_denom(i32 %num) {
+define i32 @v_srem_i32_oddk_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_srem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -490,7 +490,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
+define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_srem_v2i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -561,7 +561,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_srem_i32_pow2_shl_denom(i32 %x, i32 %y) {
+define i32 @v_srem_i32_pow2_shl_denom(i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: v_srem_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -602,7 +602,7 @@
   ret i32 %r
 }
 
-define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
+define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) #0 {
 ; GISEL-LABEL: v_srem_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -767,7 +767,7 @@
   ret <2 x i32> %r
 }
 
-define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
+define i32 @v_srem_i32_24bit(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_srem_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -853,7 +853,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_srem_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1006,3 +1006,4 @@
   %result = srem <2 x i32> %num.mask, %den.mask
   ret <2 x i32> %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
 
-define i16 @v_trunc_i32_to_i16(i32 %src) {
+define i16 @v_trunc_i32_to_i16(i32 %src) #0 {
 ; GFX7-LABEL: v_trunc_i32_to_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16,7 +16,7 @@
   ret i16 %trunc
 }
 
-define amdgpu_ps i16 @s_trunc_i32_to_i16(i32 inreg %src) {
+define amdgpu_ps i16 @s_trunc_i32_to_i16(i32 inreg %src) #0 {
 ; GFX7-LABEL: s_trunc_i32_to_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -28,7 +28,7 @@
   ret i16 %trunc
 }
 
-define i16 @v_trunc_i64_to_i16(i64 %src) {
+define i16 @v_trunc_i64_to_i16(i64 %src) #0 {
 ; GFX7-LABEL: v_trunc_i64_to_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42,7 +42,7 @@
   ret i16 %trunc
 }
 
-define amdgpu_ps i16 @s_trunc_i64_to_i16(i64 inreg %src) {
+define amdgpu_ps i16 @s_trunc_i64_to_i16(i64 inreg %src) #0 {
 ; GFX7-LABEL: s_trunc_i64_to_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -54,7 +54,7 @@
   ret i16 %trunc
 }
 
-define amdgpu_ps i16 @s_trunc_i128_to_i16(i128 inreg %src) {
+define amdgpu_ps i16 @s_trunc_i128_to_i16(i128 inreg %src) #0 {
 ; GFX7-LABEL: s_trunc_i128_to_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -66,7 +66,7 @@
   ret i16 %trunc
 }
 
-define i16 @v_trunc_i128_to_i16(i128 %src) {
+define i16 @v_trunc_i128_to_i16(i128 %src) #0 {
 ; GFX7-LABEL: v_trunc_i128_to_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -80,7 +80,7 @@
   ret i16 %trunc
 }
 
-define i32 @v_trunc_v2i32_to_v2i16(<2 x i32> %src) {
+define i32 @v_trunc_v2i32_to_v2i16(<2 x i32> %src) #0 {
 ; GFX7-LABEL: v_trunc_v2i32_to_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@
   ret i32 %cast
 }
 
-define amdgpu_ps i32 @s_trunc_v2i32_to_v2i16(<2 x i32> inreg %src) {
+define amdgpu_ps i32 @s_trunc_v2i32_to_v2i16(<2 x i32> inreg %src) #0 {
 ; GFX7-LABEL: s_trunc_v2i32_to_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
@@ -119,7 +119,7 @@
 }
 
 ; ; FIXME: G_INSERT mishandled
-; define <2 x i32> @v_trunc_v3i32_to_v3i16(<3 x i32> %src) {
+; define <2 x i32> @v_trunc_v3i32_to_v3i16(<3 x i32> %src) #0 {
 ;   %trunc = trunc <3 x i32> %src to <3 x i16>
 ;   %ext = shufflevector <3 x i16> %trunc, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ;   %cast = bitcast <4 x i16> %ext to <2 x i32>
@@ -127,14 +127,14 @@
 ; }
 
 ; ; FIXME: G_INSERT mishandled
-; define amdgpu_ps <2 x i32> @s_trunc_v3i32_to_v3i16(<3 x i32> inreg %src) {
+; define amdgpu_ps <2 x i32> @s_trunc_v3i32_to_v3i16(<3 x i32> inreg %src) #0 {
 ;   %trunc = trunc <3 x i32> %src to <3 x i16>
 ;   %ext = shufflevector <3 x i16> %trunc, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ;   %cast = bitcast <4 x i16> %ext to <2 x i32>
 ;   ret <2 x i32> %cast
 ; }
 
-define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) {
+define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) #0 {
 ; GFX7-LABEL: v_trunc_v4i32_to_v4i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -159,7 +159,7 @@
   ret <2 x i32> %cast
 }
 
-define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) {
+define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) #0 {
 ; GFX7-LABEL: s_trunc_v4i32_to_v4i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_mov_b32 s4, 0xffff
@@ -185,3 +185,4 @@
   %cast = bitcast <4 x i16> %trunc to <2 x i32>
   ret <2 x i32> %cast
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -4,7 +4,7 @@
 
 ; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
 
-define i32 @v_udiv_i32(i32 %num, i32 %den) {
+define i32 @v_udiv_i32(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_udiv_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78,7 +78,7 @@
 ; FIXME: This is a workaround for not handling uniform VGPR case.
 declare i32 @llvm.amdgcn.readfirstlane(i32)
 
-define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
+define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) #0 {
 ; GISEL-LABEL: s_udiv_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s1
@@ -154,7 +154,7 @@
   ret i32 %readlane
 }
 
-define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_udiv_v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -282,7 +282,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
+define i32 @v_udiv_i32_pow2k_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_udiv_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -316,7 +316,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
+define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -373,7 +373,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_udiv_i32_oddk_denom(i32 %num) {
+define i32 @v_udiv_i32_oddk_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_udiv_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -406,7 +406,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
+define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -463,7 +463,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
+define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: v_udiv_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -497,7 +497,7 @@
   ret i32 %r
 }
 
-define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
+define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) #0 {
 ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -632,7 +632,7 @@
   ret <2 x i32> %r
 }
 
-define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
+define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_udiv_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -711,7 +711,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_udiv_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -850,3 +850,4 @@
   %result = udiv <2 x i32> %num.mask, %den.mask
   ret <2 x i32> %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -4,7 +4,7 @@
 
 ; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
 
-define i32 @v_urem_i32(i32 %num, i32 %den) {
+define i32 @v_urem_i32(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_urem_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78,7 +78,7 @@
 ; FIXME: This is a workaround for not handling uniform VGPR case.
 declare i32 @llvm.amdgcn.readfirstlane(i32)
 
-define amdgpu_ps i32 @s_urem_i32(i32 inreg %num, i32 inreg %den) {
+define amdgpu_ps i32 @s_urem_i32(i32 inreg %num, i32 inreg %den) #0 {
 ; GISEL-LABEL: s_urem_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s1
@@ -154,7 +154,7 @@
   ret i32 %readlane
 }
 
-define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_urem_v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -282,7 +282,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_urem_i32_pow2k_denom(i32 %num) {
+define i32 @v_urem_i32_pow2k_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_urem_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -316,7 +316,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
+define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -373,7 +373,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_urem_i32_oddk_denom(i32 %num) {
+define i32 @v_urem_i32_oddk_denom(i32 %num) #0 {
 ; CHECK-LABEL: v_urem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -407,7 +407,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
+define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) #0 {
 ; CHECK-LABEL: v_urem_v2i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,7 +464,7 @@
   ret <2 x i32> %result
 }
 
-define i32 @v_urem_i32_pow2_shl_denom(i32 %x, i32 %y) {
+define i32 @v_urem_i32_pow2_shl_denom(i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: v_urem_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -498,7 +498,7 @@
   ret i32 %r
 }
 
-define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
+define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) #0 {
 ; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -633,7 +633,7 @@
   ret <2 x i32> %r
 }
 
-define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
+define i32 @v_urem_i32_24bit(i32 %num, i32 %den) #0 {
 ; GISEL-LABEL: v_urem_i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -712,7 +712,7 @@
   ret i32 %result
 }
 
-define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
+define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) #0 {
 ; GISEL-LABEL: v_urem_v2i32_24bit:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -851,3 +851,4 @@
   %result = urem <2 x i32> %num.mask, %den.mask
   ret <2 x i32> %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
 
-define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) {
+define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) #0 {
 ; GCN-LABEL: scalar_xnor_i32_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_xnor_b32 s0, s0, s1
@@ -16,7 +16,7 @@
 }
 
 ; FIXME:
-; define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) #0 {
 ; entry:
 ;   %xor = xor <2 x i16> %a, %b
 ;   %r0.val = xor <2 x i16> %xor, <i16 -1, i16 -1>
@@ -24,7 +24,7 @@
 ;   ret i32 %cast
 ; }
 
-define amdgpu_ps <2 x i32> @scalar_xnor_i32_mul_use(i32 inreg %a, i32 inreg %b) {
+define amdgpu_ps <2 x i32> @scalar_xnor_i32_mul_use(i32 inreg %a, i32 inreg %b) #0 {
 ; GCN-LABEL: scalar_xnor_i32_mul_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_xor_b32 s1, s0, s1
@@ -41,7 +41,7 @@
   ret <2 x i32> %ins1
 }
 
-define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) {
+define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) #0 {
 ; GCN-LABEL: scalar_xnor_i64_one_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_xnor_b64 s[0:1], s[0:1], s[2:3]
@@ -52,14 +52,14 @@
 }
 
 ; FIXME:
-; define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) {
+; define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) #0 {
 ;   %xor = xor <4 x i16> %a, %b
 ;   %ret = xor <4 x i16> %xor, <i16 -1, i16 -1, i16 -1, i16 -1>
 ;   %cast = bitcast <4 x i16> %ret to i64
 ;   ret i64 %cast
 ; }
 
-define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) {
+define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) #0 {
 ; GCN-LABEL: scalar_xnor_i64_mul_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s4, s0
@@ -80,7 +80,7 @@
   ret <2 x i64> %ins1
 }
 
-define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
+define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) #0 {
 ; GCN-LABEL: vector_xnor_i32_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,7 +93,7 @@
   ret i32 %r
 }
 
-define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
+define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) #0 {
 ; GCN-LABEL: vector_xnor_i64_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -108,7 +108,7 @@
   ret i64 %r
 }
 
-define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) {
+define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) #0 {
 ; GCN-LABEL: xnor_s_v_i32_one_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
@@ -120,7 +120,7 @@
   ret float %cast
 }
 
-define amdgpu_ps float @xnor_v_s_i32_one_use(i32 inreg %s, i32 %v) {
+define amdgpu_ps float @xnor_v_s_i32_one_use(i32 inreg %s, i32 %v) #0 {
 ; GCN-LABEL: xnor_v_s_i32_one_use:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
@@ -132,7 +132,7 @@
   ret float %cast
 }
 
-define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
+define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) #0 {
 ; GFX7-LABEL: xnor_i64_s_v_one_use:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 29
@@ -176,7 +176,7 @@
   ret <2 x float> %cast
 }
 
-define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
+define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) #0 {
 ; GFX7-LABEL: xnor_i64_v_s_one_use:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 29
@@ -219,7 +219,7 @@
   ret <2 x float> %cast
 }
 
-define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) {
+define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) #0 {
 ; GCN-LABEL: vector_xor_na_b_i32_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -232,7 +232,7 @@
   ret i32 %r
 }
 
-define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) {
+define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) #0 {
 ; GCN-LABEL: vector_xor_a_nb_i32_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -245,7 +245,7 @@
   ret i32 %r
 }
 
-define amdgpu_ps <2 x i32> @scalar_xor_a_nb_i64_one_use(i64 inreg %a, i64 inreg %b) {
+define amdgpu_ps <2 x i32> @scalar_xor_a_nb_i64_one_use(i64 inreg %a, i64 inreg %b) #0 {
 ; GCN-LABEL: scalar_xor_a_nb_i64_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_not_b64 s[2:3], s[2:3]
@@ -258,7 +258,7 @@
   ret <2 x i32> %cast
 }
 
-define amdgpu_ps <2 x i32> @scalar_xor_na_b_i64_one_use(i64 inreg %a, i64 inreg %b) {
+define amdgpu_ps <2 x i32> @scalar_xor_na_b_i64_one_use(i64 inreg %a, i64 inreg %b) #0 {
 ; GCN-LABEL: scalar_xor_na_b_i64_one_use:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_not_b64 s[0:1], s[0:1]
@@ -270,3 +270,4 @@
   %cast = bitcast i64 %r0.val to <2 x i32>
   ret <2 x i32> %cast
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -7,7 +7,7 @@
 ; V_ADD3_U32
 ; ===================================================================================
 
-define amdgpu_ps float @add3(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @add3(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: add3:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -31,7 +31,7 @@
 }
 
 ; V_MAD_U32_U24 is given higher priority.
-define amdgpu_ps float @mad_no_add3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_ps float @mad_no_add3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) #0 {
 ; VI-LABEL: mad_no_add3:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_mad_u32_u24 v0, v0, v1, v4
@@ -71,7 +71,7 @@
 
 ; ThreeOp instruction variant not used due to Constant Bus Limitations
 ; TODO: with reassociation it is possible to replace a v_add_u32_e32 with a s_add_i32
-define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
+define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) #0 {
 ; VI-LABEL: add3_vgpr_b:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_add_i32 s3, s3, s2
@@ -95,7 +95,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @add3_vgpr_all2(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: add3_vgpr_all2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
@@ -118,7 +118,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
+define amdgpu_ps float @add3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: add3_vgpr_bc:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -141,7 +141,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add3_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @add3_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: add3_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -164,7 +164,7 @@
   ret float %bc
 }
 
-define amdgpu_ps <2 x float> @add3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) {
+define amdgpu_ps <2 x float> @add3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) #0 {
 ; VI-LABEL: add3_multiuse_outer:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -193,7 +193,7 @@
   ret <2 x float> %bc
 }
 
-define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: add3_multiuse_inner:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -222,7 +222,7 @@
 
 ; A case where uniform values end up in VGPRs -- we could use v_add3_u32 here,
 ; but we don't.
-define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) {
+define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) #0 {
 ; VI-LABEL: add3_uniform_vgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
@@ -263,3 +263,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/add_shl.ll b/llvm/test/CodeGen/AMDGPU/add_shl.ll
--- a/llvm/test/CodeGen/AMDGPU/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_shl.ll
@@ -7,7 +7,7 @@
 ; V_ADD_LSHL_U32
 ; ===================================================================================
 
-define amdgpu_ps float @add_shl(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @add_shl(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: add_shl:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -30,7 +30,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add_shl_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) {
+define amdgpu_ps float @add_shl_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) #0 {
 ; VI-LABEL: add_shl_vgpr_c:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_add_i32 s2, s2, s3
@@ -54,7 +54,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add_shl_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) {
+define amdgpu_ps float @add_shl_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) #0 {
 ; VI-LABEL: add_shl_vgpr_ac:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -77,7 +77,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: add_shl_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -100,7 +100,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
+define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) #0 {
 ; VI-LABEL: add_shl_vgpr_const_inline_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
@@ -127,7 +127,7 @@
 ; TODO: Non-optimal code generation because SelectionDAG combines
 ;       (shl (add x, CONST), y) ---> (add (shl x, y), CONST').
 ;
-define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
+define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) #0 {
 ; VI-LABEL: add_shl_vgpr_inline_const_x2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
@@ -150,3 +150,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
 
-define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
+define i32 @select_sdiv_lhs_const_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_lhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 200000, i32 125000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -21,7 +21,7 @@
   ret i32 %op
 }
 
-define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
+define i32 @select_sdiv_rhs_const_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 1000, i32 10000
 ; IR-NEXT:    ret i32 [[OP]]
@@ -40,7 +40,7 @@
   ret i32 %op
 }
 
-define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
+define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_lhs_const_v2i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], <2 x i32> <i32 666, i32 undef>, <2 x i32> <i32 555, i32 1428>
 ; IR-NEXT:    ret <2 x i32> [[OP]]
@@ -60,7 +60,7 @@
   ret <2 x i32> %op
 }
 
-define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
+define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_rhs_const_v2i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], <2 x i32> <i32 198621, i32 20855308>, <2 x i32> <i32 222748, i32 2338858>
 ; IR-NEXT:    ret <2 x i32> [[OP]]
@@ -84,7 +84,7 @@
 
 @gv = external addrspace(1) global i32
 
-define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
+define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_lhs_opaque_const0_i32(
 ; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 ptrtoint (i32 addrspace(1)* @gv to i32), i32 5
 ; IR-NEXT:    [[TMP1:%.*]] = ashr i32 [[SELECT]], 31
@@ -179,7 +179,7 @@
   ret i32 %op
 }
 
-define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
+define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_lhs_opaque_const1_i32(
 ; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 5, i32 ptrtoint (i32 addrspace(1)* @gv to i32)
 ; IR-NEXT:    [[TMP1:%.*]] = ashr i32 [[SELECT]], 31
@@ -274,7 +274,7 @@
   ret i32 %op
 }
 
-define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
+define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_rhs_opaque_const0_i32(
 ; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 ptrtoint (i32 addrspace(1)* @gv to i32), i32 234234
 ; IR-NEXT:    [[OP:%.*]] = sdiv i32 [[SELECT]], 42
@@ -304,7 +304,7 @@
   ret i32 %op
 }
 
-define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
+define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_sdiv_rhs_opaque_const1_i32(
 ; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 42000, i32 ptrtoint (i32 addrspace(1)* @gv to i32)
 ; IR-NEXT:    [[OP:%.*]] = sdiv i32 [[SELECT]], 42
@@ -334,7 +334,7 @@
   ret i32 %op
 }
 
-define i32 @select_add_lhs_const_i32(i1 %cond) {
+define i32 @select_add_lhs_const_i32(i1 %cond) #0 {
 ; IR-LABEL: @select_add_lhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 1000005, i32 1000008
 ; IR-NEXT:    ret i32 [[OP]]
@@ -353,7 +353,7 @@
   ret i32 %op
 }
 
-define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
+define float @select_fadd_lhs_const_i32_fmf(i1 %cond) #0 {
 ; IR-LABEL: @select_fadd_lhs_const_i32_fmf(
 ; IR-NEXT:    [[OP:%.*]] = select nnan nsz i1 [[COND:%.*]], float 3.000000e+00, float 5.000000e+00
 ; IR-NEXT:    ret float [[OP]]
@@ -372,7 +372,7 @@
 }
 
 ; Make sure we don't try to use mul24 instead
-define i32 @select_mul_lhs_const_i32(i1 %cond) {
+define i32 @select_mul_lhs_const_i32(i1 %cond) #0 {
 ; GCN-LABEL: select_mul_lhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -391,7 +391,7 @@
 }
 
 ; Make sure we don't try to use mul24 instead
-define i32 @select_mul_rhs_const_i32(i1 %cond) {
+define i32 @select_mul_rhs_const_i32(i1 %cond) #0 {
 ; GCN-LABEL: select_mul_rhs_const_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -409,7 +409,7 @@
   ret i32 %op
 }
 
-define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
+define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) #0 {
 ; IR-LABEL: @select_add_lhs_const_i16(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
 ; IR-NEXT:    store i16 [[OP]], i16 addrspace(1)* undef
@@ -431,7 +431,7 @@
   ret void
 }
 
-define i16 @select_add_trunc_select(i1 %cond) {
+define i16 @select_add_trunc_select(i1 %cond) #0 {
 ; GCN-LABEL: select_add_trunc_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -448,7 +448,7 @@
   ret i16 %op
 }
 
-define i32 @select_add_sext_select(i1 %cond) {
+define i32 @select_add_sext_select(i1 %cond) #0 {
 ; IR-LABEL: @select_add_sext_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 29, i32 50
 ; IR-NEXT:    ret i32 [[OP]]
@@ -465,7 +465,7 @@
   ret i32 %op
 }
 
-define i32 @select_add_zext_select(i1 %cond) {
+define i32 @select_add_zext_select(i1 %cond) #0 {
 ; IR-LABEL: @select_add_zext_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
 ; IR-NEXT:    ret i32 [[OP]]
@@ -482,7 +482,7 @@
   ret i32 %op
 }
 
-define i32 @select_add_bitcast_select(i1 %cond) {
+define i32 @select_add_bitcast_select(i1 %cond) #0 {
 ; IR-LABEL: @select_add_bitcast_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 1065353258, i32 1073741866
 ; IR-NEXT:    ret i32 [[OP]]
@@ -504,7 +504,7 @@
 
 ; If we fold through a cast, we need to ensure it doesn't have
 ; multiple uses.
-define <2 x half> @multi_use_cast_regression(i1 %cond) {
+define <2 x half> @multi_use_cast_regression(i1 %cond) #0 {
 ; IR-LABEL: @multi_use_cast_regression(
 ; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], half 0xH3C00, half 0xH0000
 ; IR-NEXT:    [[FPEXT:%.*]] = fpext half [[SELECT]] to float
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -3,7 +3,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @udiv_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
@@ -82,7 +82,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @urem_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
@@ -161,7 +161,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @sdiv_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
@@ -258,7 +258,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @srem_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
@@ -354,7 +354,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) #0 {
 ; CHECK-LABEL: @udiv_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
@@ -401,7 +401,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) #0 {
 ; CHECK-LABEL: @urem_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
@@ -452,7 +452,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) #0 {
 ; CHECK-LABEL: @sdiv_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
@@ -508,7 +508,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) #0 {
 ; CHECK-LABEL: @srem_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
@@ -568,7 +568,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) #0 {
 ; CHECK-LABEL: @udiv_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
@@ -613,7 +613,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) #0 {
 ; CHECK-LABEL: @urem_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
@@ -663,7 +663,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) #0 {
 ; CHECK-LABEL: @sdiv_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
@@ -719,7 +719,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) #0 {
 ; CHECK-LABEL: @srem_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
@@ -780,7 +780,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: @udiv_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -1052,7 +1052,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: @urem_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -1324,7 +1324,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: @sdiv_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -1668,7 +1668,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: @srem_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -2004,7 +2004,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-LABEL: @udiv_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -2154,7 +2154,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-LABEL: @urem_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -2320,7 +2320,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-LABEL: @sdiv_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -2506,7 +2506,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-LABEL: @srem_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -2708,7 +2708,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) #0 {
 ; CHECK-LABEL: @udiv_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
@@ -2756,7 +2756,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) #0 {
 ; CHECK-LABEL: @urem_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
@@ -2809,7 +2809,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) #0 {
 ; CHECK-LABEL: @sdiv_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
@@ -2866,7 +2866,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) #0 {
 ; CHECK-LABEL: @srem_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
@@ -2928,7 +2928,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) #0 {
 ; CHECK-LABEL: @udiv_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -3046,7 +3046,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) #0 {
 ; CHECK-LABEL: @urem_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -3180,7 +3180,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) #0 {
 ; CHECK-LABEL: @sdiv_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -3324,7 +3324,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) #0 {
 ; CHECK-LABEL: @srem_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -3483,7 +3483,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) #0 {
 ; CHECK-LABEL: @udiv_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -3609,7 +3609,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) #0 {
 ; CHECK-LABEL: @urem_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -3749,7 +3749,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) #0 {
 ; CHECK-LABEL: @sdiv_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -3901,7 +3901,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) #0 {
 ; CHECK-LABEL: @srem_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -4073,7 +4073,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @udiv_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -4099,7 +4099,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @udiv_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -4121,7 +4121,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
@@ -4146,7 +4146,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) #0 {
 ; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
@@ -4175,7 +4175,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) #0 {
 ; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
@@ -4208,7 +4208,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) #0 {
 ; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -4361,7 +4361,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @urem_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -4389,7 +4389,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @urem_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -4411,7 +4411,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @urem_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
@@ -4437,7 +4437,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) #0 {
 ; CHECK-LABEL: @urem_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
@@ -4467,7 +4467,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) #0 {
 ; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -4620,7 +4620,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @sdiv_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -4646,7 +4646,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @sdiv_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -4671,7 +4671,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
@@ -4726,7 +4726,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) #0 {
 ; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
@@ -4761,7 +4761,7 @@
   ret void
 }
 
-define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) #0 {
 ; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
@@ -4797,7 +4797,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) #0 {
 ; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -4985,7 +4985,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @srem_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -5013,7 +5013,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) #0 {
 ; CHECK-LABEL: @srem_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]]
@@ -5039,7 +5039,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: @srem_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
@@ -5094,7 +5094,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) #0 {
 ; CHECK-LABEL: @srem_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
@@ -5132,7 +5132,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) #0 {
 ; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -5316,7 +5316,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @udiv_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -5451,7 +5451,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @udiv_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -5475,7 +5475,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
@@ -5503,7 +5503,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) #0 {
 ; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
@@ -5534,7 +5534,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) #0 {
 ; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
@@ -5661,7 +5661,7 @@
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -5699,7 +5699,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @urem_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -5833,7 +5833,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @urem_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -5857,7 +5857,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; CHECK-LABEL: @urem_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
@@ -5889,7 +5889,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) #0 {
 ; CHECK-LABEL: @urem_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
@@ -5921,7 +5921,7 @@
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -5965,7 +5965,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @sdiv_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -6095,7 +6095,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @sdiv_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -6123,7 +6123,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
@@ -6276,7 +6276,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) #0 {
 ; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
@@ -6315,7 +6315,7 @@
   ret void
 }
 
-define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) #0 {
 ; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
@@ -6457,7 +6457,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -6746,7 +6746,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @srem_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -6874,7 +6874,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) #0 {
 ; CHECK-LABEL: @srem_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
 ; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]]
@@ -6904,7 +6904,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; CHECK-LABEL: @srem_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
@@ -7055,7 +7055,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) #0 {
 ; CHECK-LABEL: @srem_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
@@ -7099,7 +7099,7 @@
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -7383,3 +7383,5 @@
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
 
-define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(float addrspace(1)* %p) #4 {
+define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(float addrspace(1)* %p) #0 {
 ; GCN-LABEL: test_mul24_knownbits_kernel:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
@@ -32,3 +32,4 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #20
 
 !4 = !{i32 0, i32 1024}
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/and_or.ll b/llvm/test/CodeGen/AMDGPU/and_or.ll
--- a/llvm/test/CodeGen/AMDGPU/and_or.ll
+++ b/llvm/test/CodeGen/AMDGPU/and_or.ll
@@ -7,7 +7,7 @@
 ; V_AND_OR_B32
 ; ===================================================================================
 
-define amdgpu_ps float @and_or(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @and_or(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: and_or:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -31,7 +31,7 @@
 }
 
 ; ThreeOp instruction variant not used due to Constant Bus Limitations
-define amdgpu_ps float @and_or_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
+define amdgpu_ps float @and_or_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) #0 {
 ; VI-LABEL: and_or_vgpr_b:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_and_b32_e32 v0, s2, v0
@@ -55,7 +55,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @and_or_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
+define amdgpu_ps float @and_or_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) #0 {
 ; VI-LABEL: and_or_vgpr_ab:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -78,7 +78,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @and_or_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @and_or_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: and_or_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_and_b32_e32 v0, 4, v0
@@ -101,7 +101,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @and_or_vgpr_const_inline_const(i32 %a) {
+define amdgpu_ps float @and_or_vgpr_const_inline_const(i32 %a) #0 {
 ; VI-LABEL: and_or_vgpr_const_inline_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_and_b32_e32 v0, 20, v0
@@ -125,7 +125,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @and_or_vgpr_inline_const_x2(i32 %a) {
+define amdgpu_ps float @and_or_vgpr_inline_const_x2(i32 %a) #0 {
 ; VI-LABEL: and_or_vgpr_inline_const_x2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_and_b32_e32 v0, 4, v0
@@ -147,3 +147,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -12,7 +12,7 @@
 
 ; Show what the atomic optimization pass will do for local pointers.
 
-define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_constant:
@@ -172,7 +172,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
+define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_uniform:
@@ -356,7 +356,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_varying:
@@ -595,7 +595,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_varying_gfx1032:
@@ -834,7 +834,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_varying_gfx1064:
@@ -1073,7 +1073,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i64_constant:
@@ -1251,7 +1251,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
+define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i64_uniform:
@@ -1479,7 +1479,7 @@
 ; GCN-NOT: v_mbcnt_lo_u32_b32
 ; GCN-NOT: v_mbcnt_hi_u32_b32
 ; GCN-NOT: s_bcnt1_i32_b64
-define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: add_i64_varying:
@@ -1566,7 +1566,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i32_constant:
@@ -1731,7 +1731,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
+define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i32_uniform:
@@ -1915,7 +1915,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i32_varying:
@@ -2154,7 +2154,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i64_constant:
@@ -2338,7 +2338,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
+define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i64_uniform:
@@ -2566,7 +2566,7 @@
 ; GCN-NOT: v_mbcnt_lo_u32_b32
 ; GCN-NOT: v_mbcnt_hi_u32_b32
 ; GCN-NOT: s_bcnt1_i32_b64
-define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i64_varying:
@@ -2656,7 +2656,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: and_i32_varying:
@@ -2898,7 +2898,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: or_i32_varying:
@@ -3140,7 +3140,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: xor_i32_varying:
@@ -3382,7 +3382,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: max_i32_varying:
@@ -3621,7 +3621,7 @@
   ret void
 }
 
-define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: max_i64_constant:
@@ -3814,7 +3814,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: min_i32_varying:
@@ -4053,7 +4053,7 @@
   ret void
 }
 
-define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: min_i64_constant:
@@ -4246,7 +4246,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: umax_i32_varying:
@@ -4485,7 +4485,7 @@
   ret void
 }
 
-define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: umax_i64_constant:
@@ -4675,7 +4675,7 @@
 ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
-define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: umin_i32_varying:
@@ -4914,7 +4914,7 @@
   ret void
 }
 
-define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) #0 {
 ;
 ;
 ; GFX7LESS-LABEL: umin_i64_constant:
@@ -5100,3 +5100,4 @@
   store i64 %old, i64 addrspace(1)* %out
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -11,7 +11,7 @@
 
 ; Show what the atomic optimization pass will do for raw buffers.
 
-define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
+define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) #0 {
 ; GFX7-LABEL: add_i32_constant:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b64 s[10:11], exec
@@ -192,7 +192,7 @@
   ret void
 }
 
-define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
+define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) #0 {
 ; GFX7-LABEL: add_i32_varying:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
@@ -446,3 +446,4 @@
 else:
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
 
-define <2 x half> @chain_hi_to_lo_private() {
+define <2 x half> @chain_hi_to_lo_private() #0 {
 ; GCN-LABEL: chain_hi_to_lo_private:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -22,7 +22,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
+define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) #0 {
 ; GCN-LABEL: chain_hi_to_lo_private_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -41,7 +41,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
+define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) #0 {
 ; GCN-LABEL: chain_hi_to_lo_arithmatic:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -60,7 +60,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_group() {
+define <2 x half> @chain_hi_to_lo_group() #0 {
 ; GCN-LABEL: chain_hi_to_lo_group:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -82,7 +82,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
+define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) #0 {
 ; GCN-LABEL: chain_hi_to_lo_group_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -101,7 +101,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_global() {
+define <2 x half> @chain_hi_to_lo_global() #0 {
 ; GCN-LABEL: chain_hi_to_lo_global:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -126,7 +126,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
+define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) #0 {
 ; GCN-LABEL: chain_hi_to_lo_global_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -145,7 +145,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_flat() {
+define <2 x half> @chain_hi_to_lo_flat() #0 {
 ; GCN-LABEL: chain_hi_to_lo_flat:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -170,7 +170,7 @@
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
+define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) #0 {
 ; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -251,7 +251,7 @@
 
 ; There is another instruction between the misordered instruction and
 ; the value dependent load, so a simple operand check is insufficient.
-define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) #0 {
 ; GCN-LABEL: chain_hi_to_lo_group_other_dep:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -274,7 +274,7 @@
 }
 
 ; The volatile operations aren't put on the same chain
-define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) #0 {
 ; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -296,7 +296,7 @@
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) #0 {
 ; GCN-LABEL: chain_hi_to_lo_private_other_dep:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -318,7 +318,7 @@
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) #0 {
 ; GCN-LABEL: chain_hi_to_lo_global_other_dep:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -340,7 +340,7 @@
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) #0 {
 ; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -362,7 +362,7 @@
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
+define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) #0 {
 ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -385,3 +385,4 @@
   %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
   ret <2 x i16> %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -875,7 +875,7 @@
   ret void
 }
 
-define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 {
 ; SI-LABEL: cvt_ubyte0_or_multiuse:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -927,3 +927,4 @@
   store float %add, float addrspace(1)* %out
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-define amdgpu_kernel void @eq_t(float %x) {
+define amdgpu_kernel void @eq_t(float %x) #0 {
 ; GCN-LABEL: eq_t:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -18,7 +18,7 @@
   ret void
 }
 
-define amdgpu_kernel void @ne_t(float %x) {
+define amdgpu_kernel void @ne_t(float %x) #0 {
 ; GCN-LABEL: ne_t:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -35,7 +35,7 @@
   ret void
 }
 
-define amdgpu_kernel void @eq_f(float %x) {
+define amdgpu_kernel void @eq_f(float %x) #0 {
 ; GCN-LABEL: eq_f:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -52,7 +52,7 @@
   ret void
 }
 
-define amdgpu_kernel void @ne_f(float %x) {
+define amdgpu_kernel void @ne_f(float %x) #0 {
 ; GCN-LABEL: ne_f:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -69,7 +69,7 @@
   ret void
 }
 
-define amdgpu_kernel void @different_constants(float %x) {
+define amdgpu_kernel void @different_constants(float %x) #0 {
 ; GCN-LABEL: different_constants:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2.0
@@ -82,3 +82,4 @@
   store float %s2, float* undef, align 4
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck %s
+
+; CHECK-LABEL: kern1:
+; CHECK: .cfi_startproc
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: %bb.0:
+; DW_CFA_def_cfa_expression [0x0f]
+;   BLOCK_LENGTH ULEB128(3)=[0x03]
+;     DW_OP_lit0 [0x30]
+;     DW_OP_lit6 [0x36]
+;     DW_OP_LLVM_form_aspace_address [0xe1]
+; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe1
+; PC_64 has DWARF register number 16.
+; CHECK-NEXT: .cfi_undefined 16
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: .cfi_endproc
+define protected amdgpu_kernel void @kern1() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "filename", directory: "directory")
+!2 = !{i32 7, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
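As a cross-check of the byte-level breakdown in the test comments above, here is a minimal standalone C++ sketch (illustrative only, not part of the patch) that prints each byte of the checked .cfi_escape payload next to the DWARF construct it encodes. The byte meanings are taken from the test's own comments; 0xe1 (DW_OP_LLVM_form_aspace_address) lies in the DWARF vendor opcode range (DW_OP_lo_user through DW_OP_hi_user).

// Illustrative only: decodes the .cfi_escape payload checked by debug-frame.ll.
// Byte meanings follow the comment block in the test itself.
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Escape[] = {0x0f, 0x03, 0x30, 0x36, 0xe1};
  const char *Meaning[] = {
      "DW_CFA_def_cfa_expression",               // 0x0f
      "expression block length = 3",             // ULEB128(3)
      "DW_OP_lit0 (push literal 0)",             // 0x30
      "DW_OP_lit6 (push literal 6)",             // 0x36
      "DW_OP_LLVM_form_aspace_address (vendor)"  // 0xe1
  };
  for (unsigned I = 0; I != sizeof(Escape); ++I)
    printf("0x%02x  %s\n", Escape[I], Meaning[I]);
  return 0;
}

Compiled and run, this prints one line per byte, in the same order the CHECK lines above verify the directive.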
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -15,7 +15,7 @@
 
 target triple = "amdgcn-mesa-mesa3d"
 
-define amdgpu_ps void @main(i32 %0, float %1) {
+define amdgpu_ps void @main(i32 %0, float %1) #0 {
 ; ISA-LABEL: main:
 ; ISA:       ; %bb.0: ; %start
 ; ISA-NEXT:    v_readfirstlane_b32 s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -364,7 +364,7 @@
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
-define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -385,7 +385,7 @@
 ; GCN: {{^}}; %bb.0:
 ; GCN-NEXT: s_load_dwordx2
 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
-define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   br i1 undef, label %else, label %if
 
diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -5,7 +5,7 @@
 
 ; Make sure the add and load are reduced to 32 bits even with the
 ; bitcast to vector.
-define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) #0 {
 ; GCN-LABEL: bitcast_int_to_vector_extract_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -35,7 +35,7 @@
    ret void
 }
 
-define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) #0 {
 ; GCN-LABEL: bitcast_fp_to_vector_extract_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -65,7 +65,7 @@
    ret void
 }
 
-define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) #0 {
 ; GCN-LABEL: bitcast_int_to_fpvector_extract_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -95,7 +95,7 @@
    ret void
 }
 
-define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: no_extract_volatile_load_extract0:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -119,7 +119,7 @@
   ret void
 }
 
-define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: no_extract_volatile_load_extract2:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -143,7 +143,7 @@
   ret void
 }
 
-define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) #0 {
 ; GCN-LABEL: no_extract_volatile_load_dynextract:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -173,3 +173,4 @@
   store i32 %eltN, i32 addrspace(1)* %out
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -326,3 +326,4 @@
 
 
 
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 
-define float @v_pow_f32(float %x, float %y) {
+define float @v_pow_f32(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31,7 +31,7 @@
   ret float %pow
 }
 
-define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
+define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -68,7 +68,7 @@
   ret <2 x float> %pow
 }
 
-define half @v_pow_f16(half %x, half %y) {
+define half @v_pow_f16(half %x, half %y) #0 {
 ; GFX6-LABEL: v_pow_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -106,7 +106,7 @@
   ret half %pow
 }
 
-define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -166,7 +166,7 @@
   ret <2 x half> %pow
 }
 
-define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -231,7 +231,7 @@
   ret <2 x half> %pow
 }
 
-define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -296,7 +296,7 @@
   ret <2 x half> %pow
 }
 
-define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
+define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -368,12 +368,12 @@
 }
 
 ; FIXME
-; define double @v_pow_f64(double %x, double %y) {
+; define double @v_pow_f64(double %x, double %y) #0 {
 ;   %pow = call double @llvm.pow.f64(double %x, double %y)
 ;   ret double %pow
 ; }
 
-define float @v_pow_f32_fabs_lhs(float %x, float %y) {
+define float @v_pow_f32_fabs_lhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -405,7 +405,7 @@
   ret float %pow
 }
 
-define float @v_pow_f32_fabs_rhs(float %x, float %y) {
+define float @v_pow_f32_fabs_rhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fabs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -437,7 +437,7 @@
   ret float %pow
 }
 
-define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
+define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -476,7 +476,7 @@
   ret float %pow
 }
 
-define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
+define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) #0 {
 ; GFX6-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_log_f32_e32 v1, s0
@@ -501,7 +501,7 @@
   ret float %pow
 }
 
-define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
+define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) #0 {
 ; GFX6-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
@@ -526,7 +526,7 @@
   ret float %pow
 }
 
-define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
+define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) #0 {
 ; GFX6-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_log_f32_e32 v0, s0
@@ -560,3 +560,4 @@
 
 declare <2 x half> @llvm.pow.v2f16(<2 x half>, <2 x half>)
 declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -8,7 +8,7 @@
 declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
 
-define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) #0 {
 ; SI-LABEL: fshl_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -84,7 +84,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) #0 {
 ; SI-LABEL: fshl_i32_imm:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -138,7 +138,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
+define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) #0 {
 ; SI-LABEL: fshl_v2i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -250,7 +250,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) #0 {
 ; SI-LABEL: fshl_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -315,7 +315,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) #0 {
 ; SI-LABEL: fshl_v4i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -487,7 +487,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) #0 {
 ; SI-LABEL: fshl_v4i32_imm:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -567,3 +567,4 @@
   store <4 x i32> %0, <4 x i32> addrspace(1)* %in
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -17,7 +17,7 @@
 declare i24 @llvm.fshr.i24(i24, i24, i24)
 declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
 
-define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) #0 {
 ; SI-LABEL: fshr_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -73,7 +73,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) #0 {
 ; SI-LABEL: fshr_i32_imm:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -127,7 +127,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
+define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) #0 {
 ; SI-LABEL: fshr_v2i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -225,7 +225,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) #0 {
 ; SI-LABEL: fshr_v2i32_imm:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -290,7 +290,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) #0 {
 ; SI-LABEL: fshr_v4i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -433,7 +433,7 @@
   ret void
 }
 
-define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) #0 {
 ; SI-LABEL: fshr_v4i32_imm:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -512,7 +512,7 @@
   ret void
 }
 
-define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
+define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) #0 {
 ; GFX89-LABEL: v_fshr_i32:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -527,7 +527,7 @@
   ret i32 %ret
 }
 
-define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
+define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) #0 {
 ; GFX89-LABEL: v_fshr_v2i32:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,7 +549,7 @@
   ret <2 x i32> %ret
 }
 
-define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
+define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) #0 {
 ; GFX89-LABEL: v_fshr_v3i32:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -575,7 +575,7 @@
   ret <3 x i32> %ret
 }
 
-define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
+define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) #0 {
 ; GFX89-LABEL: v_fshr_v4i32:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -605,7 +605,7 @@
   ret <4 x i32> %ret
 }
 
-define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
+define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) #0 {
 ; SI-LABEL: v_fshr_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -651,7 +651,7 @@
   ret i16 %ret
 }
 
-define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
+define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) #0 {
 ; SI-LABEL: v_fshr_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -729,7 +729,7 @@
   ret <2 x i16> %ret
 }
 
-define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
+define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) #0 {
 ; SI-LABEL: v_fshr_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -843,7 +843,7 @@
   ret <3 x i16> %ret
 }
 
-define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
+define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) #0 {
 ; SI-LABEL: v_fshr_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -987,7 +987,7 @@
   ret <4 x i16> %ret
 }
 
-define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
+define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) #0 {
 ; SI-LABEL: v_fshr_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1041,7 +1041,7 @@
   ret i64 %ret
 }
 
-define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
+define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) #0 {
 ; SI-LABEL: v_fshr_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1122,7 +1122,7 @@
   ret <2 x i64> %ret
 }
 
-define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
+define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) #0 {
 ; SI-LABEL: v_fshr_i24:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1190,7 +1190,7 @@
   ret i24 %ret
 }
 
-define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
+define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) #0 {
 ; SI-LABEL: v_fshr_v2i24:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1366,3 +1366,4 @@
   %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
   ret <2 x i24> %ret
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
+define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) #0 {
 ; SI-LABEL: i1_copy_from_loop:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_mov_b32 s6, 0
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
 
-define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) #0 {
 ; GFX9-LABEL: udiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -65,7 +65,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) #0 {
 ; GFX9-LABEL: urem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -133,7 +133,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) #0 {
 ; GFX9-LABEL: sdiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
@@ -197,7 +197,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) #0 {
 ; GFX9-LABEL: srem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -259,7 +259,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) #0 {
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
@@ -310,7 +310,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) #0 {
 ; GFX9-LABEL: urem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
@@ -363,7 +363,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) #0 {
 ; GFX9-LABEL: sdiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -418,7 +418,7 @@
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) #0 {
 ; GFX9-LABEL: srem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -474,3 +474,4 @@
   %tmp8 = icmp eq i16 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -117,7 +117,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
-                                 i32 addrspace(1)* nocapture %dst) {
+                                 i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -265,7 +265,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                         <2 x i16> addrspace(1)* %src2,
-                                        i32 addrspace(1)* nocapture %dst) {
+                                        i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -392,7 +392,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
-                                 i32 addrspace(1)* nocapture %dst) {
+                                 i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -531,7 +531,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <2 x i16> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -663,7 +663,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <2 x i16> addrspace(1)* %src2,
-                                                 i32 addrspace(1)* nocapture %dst) {
+                                                 i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -802,7 +802,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i16> addrspace(1)* %src2,
-                                          i32 addrspace(1)* nocapture %dst) {
+                                          i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -922,7 +922,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                             <2 x i16> addrspace(1)* %src2,
-                                            i32 addrspace(1)* nocapture %dst) {
+                                            i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -1054,7 +1054,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i16> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
   %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
@@ -1186,7 +1186,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           <4 x i16> addrspace(1)* %src2,
-                                          i32 addrspace(1)* nocapture %dst) {
+                                          i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
   %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
@@ -1331,7 +1331,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i16> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
   %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
@@ -1476,7 +1476,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <4 x i16> addrspace(1)* %src2,
-                                                 i32 addrspace(1)* nocapture %dst) {
+                                                 i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
   %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
@@ -1621,7 +1621,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <2 x i16> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -1771,7 +1771,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -1917,7 +1917,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -2069,7 +2069,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -2216,7 +2216,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -2369,7 +2369,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -2516,7 +2516,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -2650,7 +2650,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <2 x i16> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
   %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
@@ -2811,7 +2811,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i8> addrspace(1)* %src2,
-                                          i32 addrspace(1)* nocapture %dst) {
+                                          i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1
   %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2
@@ -2834,3 +2834,4 @@
   store i32 %add6, i32 addrspace(1)* %dst, align 4
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -134,7 +134,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -311,7 +311,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -480,7 +480,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
-                                      i8 addrspace(1)* nocapture %dst) {
+                                      i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -668,7 +668,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -867,7 +867,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1077,7 +1077,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1100,3 +1100,4 @@
   store i16 %add4, i16 addrspace(1)* %dst, align 4
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -137,7 +137,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -307,7 +307,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -477,7 +477,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
-                                      i8 addrspace(1)* nocapture %dst) {
+                                      i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -630,7 +630,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                    <4 x i8> addrspace(1)* %src2,
-                                   i8 addrspace(1)* nocapture %dst) {
+                                   i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -781,7 +781,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                       <4 x i8> addrspace(1)* %src2,
-                                                      i8 addrspace(1)* nocapture %dst) {
+                                                      i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -972,7 +972,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                         <4 x i8> addrspace(1)* %src2,
-                                                        i8 addrspace(1)* nocapture %dst) {
+                                                        i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1166,7 +1166,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1374,7 +1374,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1571,7 +1571,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1773,7 +1773,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -1972,7 +1972,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -2193,7 +2193,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
-                                             i8 addrspace(1)* nocapture %dst) {
+                                             i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
   %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
@@ -2213,3 +2213,4 @@
   store i8 %add4, i8 addrspace(1)* %dst, align 4
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -182,7 +182,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -518,7 +518,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -862,7 +862,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
-                                       i8 addrspace(1)* nocapture %dst) {
+                                       i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1170,7 +1170,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
-                                                i32 addrspace(1)* nocapture %dst) {
+                                                i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1543,7 +1543,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1902,7 +1902,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -2371,7 +2371,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
-                                             i8 addrspace(1)* nocapture %dst) {
+                                             i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -2402,3 +2402,4 @@
   store i8 %add8, i8 addrspace(1)* %dst, align 4
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -182,7 +182,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -485,7 +485,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -788,7 +788,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
-                                      i8 addrspace(1)* nocapture %dst) {
+                                      i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1104,7 +1104,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
-                                      i4 addrspace(1)* nocapture %dst) {
+                                      i4 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1404,7 +1404,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                       <8 x i4> addrspace(1)* %src2,
-                                                      i4 addrspace(1)* nocapture %dst) {
+                                                      i4 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1694,7 +1694,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
-                                                i32 addrspace(1)* nocapture %dst) {
+                                                i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -1939,7 +1939,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -2242,7 +2242,7 @@
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              i16 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -2615,7 +2615,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
-                                             i8 addrspace(1)* nocapture %dst) {
+                                             i8 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -2895,7 +2895,7 @@
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
-                                             i4 addrspace(1)* nocapture %dst) {
+                                             i4 addrspace(1)* nocapture %dst) #0 {
 entry:
   %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
   %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2
@@ -3100,7 +3100,7 @@
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           i32 addrspace(1)* %v2addr,
-                                          i32 addrspace(1)* %dst) {
+                                          i32 addrspace(1)* %dst) #0 {
 entry:
   %v1 = load i32, i32 addrspace(1)* %v1addr, align 4
   %v2 = load i32, i32 addrspace(1)* %v2addr, align 4
@@ -3160,3 +3160,4 @@
   store i32 %add8, i32 addrspace(1)* %dst, align 4
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
--- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
 
-define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GFX9-LABEL: load_1d_f16_tfe_dmask0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -66,7 +66,7 @@
   ret void
 }
 
-define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GFX9-LABEL: load_1d_f16_tfe_dmask1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -129,7 +129,7 @@
   ret void
 }
 
-define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GFX9-LABEL: load_1d_v2f16_tfe_dmask0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -192,7 +192,7 @@
   ret void
 }
 
-define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GFX9-LABEL: load_1d_v2f16_tfe_dmask1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -255,7 +255,7 @@
   ret void
 }
 
-define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GFX9-LABEL: load_1d_v2f16_tfe_dmask3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -321,7 +321,7 @@
   ret void
 }
 
-; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
+; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ;   %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
 ;   %v.data = extractvalue { <3 x half>, i32 } %v, 0
 ;   %v.err = extractvalue { <3 x half>, i32 } %v, 1
@@ -330,7 +330,7 @@
 ;   ret void
 ; }
 
-define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GFX9-LABEL: load_1d_v4f16_tfe_dmask15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -3,7 +3,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
-define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) #0 {
 ; SI-LABEL: i64_imm_inline_lo:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -31,7 +31,7 @@
 }
 
 ; Use a 64-bit value with hi bits that can be represented as an inline constant
-define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) #0 {
 ; SI-LABEL: i64_imm_inline_hi:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -58,7 +58,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) #0 {
 ; SI-LABEL: store_imm_neg_0.0_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -84,7 +84,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_neg_0.0_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -108,7 +108,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_0.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -132,7 +132,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_imm_neg_0.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -156,7 +156,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -180,7 +180,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -204,7 +204,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -228,7 +228,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -252,7 +252,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -276,7 +276,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -300,7 +300,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -324,7 +324,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -348,7 +348,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_inv_2pi_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -372,7 +372,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_inv_2pi_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -396,7 +396,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) #0 {
 ; SI-LABEL: store_literal_imm_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -420,7 +420,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_0.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -447,7 +447,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -474,7 +474,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -501,7 +501,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -528,7 +528,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -555,7 +555,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -582,7 +582,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -609,7 +609,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -636,7 +636,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -663,7 +663,7 @@
   ret void
 }
 
-define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 ; SI-LABEL: commute_add_inline_imm_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -705,7 +705,7 @@
   ret void
 }
 
-define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 ; SI-LABEL: commute_add_literal_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -747,7 +747,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_1_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -774,7 +774,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_2_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -801,7 +801,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_16_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -828,7 +828,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_1_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -859,7 +859,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_2_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -890,7 +890,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_16_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -921,7 +921,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_63_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -948,7 +948,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) #0 {
 ; SI-LABEL: add_inline_imm_64_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -975,7 +975,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_0.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1002,7 +1002,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1029,7 +1029,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1056,7 +1056,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1083,7 +1083,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1110,7 +1110,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1137,7 +1137,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1164,7 +1164,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1191,7 +1191,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1218,7 +1218,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1247,7 +1247,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_m_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1278,7 +1278,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_1_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1305,7 +1305,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_2_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1332,7 +1332,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_16_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1359,7 +1359,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_1_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1386,7 +1386,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_2_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1413,7 +1413,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_neg_16_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1440,7 +1440,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_63_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1467,7 +1467,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) #0 {
 ; SI-LABEL: add_inline_imm_64_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1494,7 +1494,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_0.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1520,7 +1520,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_literal_imm_neg_0.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1546,7 +1546,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1572,7 +1572,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1598,7 +1598,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1624,7 +1624,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1650,7 +1650,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1676,7 +1676,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1702,7 +1702,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1728,7 +1728,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1754,7 +1754,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1780,7 +1780,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_inline_imm_m_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1806,7 +1806,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: store_literal_imm_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1832,7 +1832,7 @@
   ret void
 }
 
-define amdgpu_vs void @literal_folding(float %arg) {
+define amdgpu_vs void @literal_folding(float %arg) #0 {
 ; GCN-LABEL: literal_folding:
 ; GCN:       ; %bb.0: ; %main_body
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3f4353f8, v0
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -4,7 +4,7 @@
 
 ; FIXME: Merge into imm.ll
 
-define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_neg_0.0_i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -28,7 +28,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_0.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -52,7 +52,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_imm_neg_0.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -76,7 +76,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_0.5_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -100,7 +100,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_m_0.5_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -124,7 +124,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_1.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -148,7 +148,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_m_1.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -172,7 +172,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_2.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -196,7 +196,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_m_2.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -220,7 +220,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_4.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -244,7 +244,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_m_4.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -268,7 +268,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_inv_2pi_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -292,7 +292,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_inline_imm_m_inv_2pi_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -316,7 +316,7 @@
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) #0 {
 ; VI-LABEL: store_literal_imm_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -340,7 +340,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_0.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -369,7 +369,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_0.5_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -398,7 +398,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_neg_0.5_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -427,7 +427,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_1.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -456,7 +456,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_neg_1.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -485,7 +485,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_2.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -514,7 +514,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_neg_2.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -543,7 +543,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_4.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -572,7 +572,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_neg_4.0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -601,7 +601,7 @@
   ret void
 }
 
-define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
 ; VI-LABEL: commute_add_inline_imm_0.5_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -645,7 +645,7 @@
   ret void
 }
 
-define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
 ; VI-LABEL: commute_add_literal_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -689,7 +689,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_1_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -718,7 +718,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_2_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -747,7 +747,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_16_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -776,7 +776,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
 ; VI-LABEL: add_inline_imm_neg_1_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -819,7 +819,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
 ; VI-LABEL: add_inline_imm_neg_2_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -862,7 +862,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
 ; VI-LABEL: add_inline_imm_neg_16_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -905,7 +905,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_63_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -934,7 +934,7 @@
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) #0 {
 ; VI-LABEL: add_inline_imm_64_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -962,3 +962,4 @@
   store half %y, half addrspace(1)* %out
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -4,7 +4,7 @@
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant void()*, align 4
 @gv.fptr1 = external hidden unnamed_addr addrspace(4) constant void(i32)*, align 4
 
-define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
+define amdgpu_kernel void @test_indirect_call_sgpr_ptr() #0 {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr:
 ; GCN:         .amd_kernel_code_t
 ; GCN-NEXT:     amd_code_version_major = 1
@@ -97,7 +97,7 @@
   ret void
 }
 
-define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
+define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() #0 {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr_arg:
 ; GCN:         .amd_kernel_code_t
 ; GCN-NEXT:     amd_code_version_major = 1
@@ -192,12 +192,13 @@
 }
 
 ; FIXME
-; define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
+; define void @test_indirect_call_vgpr_ptr(void()* %fptr) #0 {
 ;   call void %fptr()
 ;   ret void
 ; }
 
-; define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
+; define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) #0 {
 ;   call void %fptr(i32 123)
 ;   ret void
 ; }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify %s | FileCheck -check-prefix=IR %s
 
-define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) #0 {
 ; SI-LABEL: infinite_loop:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -30,7 +30,7 @@
   br label %loop
 }
 
-define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) #0 {
 ; SI-LABEL: infinite_loop_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -73,7 +73,7 @@
   ret void
 }
 
-define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) #0 {
 ; SI-LABEL: infinite_loops:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -123,7 +123,7 @@
   br label %loop2
 }
 
-define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) #0 {
 ; SI-LABEL: infinite_loop_nest_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -195,4 +195,6 @@
   ret void
 }
 
-declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1324,7 +1324,7 @@
 
 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
 ; the compiler doesn't crash.
-define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) #0 {
 ; SI-LABEL: insert_split_bb:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s0, s[4:5], 0x4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -21,7 +21,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -41,7 +41,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
@@ -62,7 +62,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
@@ -83,7 +83,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
@@ -103,7 +103,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
@@ -124,7 +124,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
@@ -145,7 +145,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
@@ -167,7 +167,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_mip_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -187,7 +187,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
@@ -208,7 +208,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
@@ -230,7 +230,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
@@ -252,7 +252,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
@@ -273,7 +273,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
@@ -295,7 +295,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
@@ -312,7 +312,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
@@ -330,7 +330,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
@@ -349,7 +349,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
@@ -368,7 +368,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 da
@@ -386,7 +386,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
@@ -405,7 +405,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
@@ -424,7 +424,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
@@ -444,7 +444,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_mip_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf unorm a16
@@ -462,7 +462,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
@@ -481,7 +481,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
@@ -501,7 +501,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
@@ -521,7 +521,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
@@ -540,7 +540,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
@@ -560,7 +560,7 @@
   ret void
 }
 
-define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -579,7 +579,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -598,7 +598,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -617,7 +617,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
@@ -636,7 +636,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
@@ -655,7 +655,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
@@ -674,7 +674,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
@@ -693,7 +693,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da
@@ -712,7 +712,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_V1:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 unorm a16
@@ -731,7 +731,7 @@
   ret float %v
 }
 
-define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_V2:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 unorm a16
@@ -750,7 +750,7 @@
   ret <2 x float> %v
 }
 
-define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_V1:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 unorm a16
@@ -767,7 +767,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_V2:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc unorm a16
@@ -784,7 +784,7 @@
   ret void
 }
 
-define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_glc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc a16
@@ -803,7 +803,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc a16
@@ -822,7 +822,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_glc_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc a16
@@ -841,7 +841,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_glc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc a16
@@ -858,7 +858,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc a16
@@ -875,7 +875,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_glc_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc a16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -21,7 +21,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -41,7 +41,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -62,7 +62,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -83,7 +83,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -103,7 +103,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -124,7 +124,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -145,7 +145,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -167,7 +167,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_mip_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x04,0xf0,0x00,0x00,0x00,0x00]
@@ -187,7 +187,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x04,0xf0,0x00,0x00,0x00,0x00]
@@ -208,7 +208,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x04,0xf0,0x00,0x00,0x00,0x00]
@@ -230,7 +230,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x04,0xf0,0x00,0x00,0x00,0x00]
@@ -252,7 +252,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x04,0xf0,0x00,0x00,0x00,0x00]
@@ -273,7 +273,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: load_mip_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x04,0xf0,0x00,0x00,0x00,0x00]
@@ -295,7 +295,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -312,7 +312,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -330,7 +330,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -349,7 +349,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -368,7 +368,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -386,7 +386,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -405,7 +405,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -424,7 +424,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -444,7 +444,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_mip_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x24,0xf0,0x04,0x00,0x00,0x00]
@@ -462,7 +462,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x24,0xf0,0x04,0x00,0x00,0x00]
@@ -481,7 +481,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x24,0xf0,0x04,0x00,0x00,0x00]
@@ -501,7 +501,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x24,0xf0,0x04,0x00,0x00,0x00]
@@ -521,7 +521,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x24,0xf0,0x04,0x00,0x00,0x00]
@@ -540,7 +540,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) #0 {
 ; GFX9-LABEL: store_mip_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x24,0xf0,0x04,0x00,0x00,0x00]
@@ -560,7 +560,7 @@
   ret void
 }
 
-define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -579,7 +579,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -598,7 +598,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -617,7 +617,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -636,7 +636,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -655,7 +655,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -674,7 +674,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2dmsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -693,7 +693,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: getresinfo_2darraymsaa:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00]
@@ -712,7 +712,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_V1:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 unorm a16 ; encoding: [0x00,0x98,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -731,7 +731,7 @@
   ret float %v
 }
 
-define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_V2:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 unorm a16 ; encoding: [0x00,0x99,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -750,7 +750,7 @@
   ret <2 x float> %v
 }
 
-define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_V1:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 unorm a16 ; encoding: [0x00,0x92,0x20,0xf0,0x01,0x00,0x00,0x00]
@@ -767,7 +767,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_V2:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc unorm a16 ; encoding: [0x00,0x9c,0x20,0xf0,0x02,0x00,0x00,0x00]
@@ -784,7 +784,7 @@
   ret void
 }
 
-define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_glc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc a16 ; encoding: [0x00,0xbf,0x00,0xf0,0x00,0x00,0x00,0x00]
@@ -803,7 +803,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc a16 ; encoding: [0x00,0x9f,0x00,0xf2,0x00,0x00,0x00,0x00]
@@ -822,7 +822,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: load_1d_glc_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc a16 ; encoding: [0x00,0xbf,0x00,0xf2,0x00,0x00,0x00,0x00]
@@ -841,7 +841,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_glc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc a16 ; encoding: [0x00,0xbf,0x20,0xf0,0x04,0x00,0x00,0x00]
@@ -858,7 +858,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc a16 ; encoding: [0x00,0x9f,0x20,0xf2,0x04,0x00,0x00,0x00]
@@ -875,7 +875,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
 ; GFX9-LABEL: store_1d_glc_slc:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc a16 ; encoding: [0x00,0xbf,0x20,0xf2,0x04,0x00,0x00,0x00]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm
@@ -41,7 +41,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -125,7 +125,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_lwe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -209,7 +209,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) #0 {
 ; VERDE-LABEL: load_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
@@ -245,7 +245,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
+define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) #0 {
 ; VERDE-LABEL: load_2d_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -333,7 +333,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) #0 {
 ; VERDE-LABEL: load_3d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm
@@ -369,7 +369,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
+define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) #0 {
 ; VERDE-LABEL: load_3d_tfe_lwe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -461,7 +461,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) #0 {
 ; VERDE-LABEL: load_cube:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da
@@ -497,7 +497,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) #0 {
 ; VERDE-LABEL: load_cube_lwe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -589,7 +589,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
+define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) #0 {
 ; VERDE-LABEL: load_1darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da
@@ -625,7 +625,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
+define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) #0 {
 ; VERDE-LABEL: load_1darray_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -713,7 +713,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) #0 {
 ; VERDE-LABEL: load_2darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da
@@ -749,7 +749,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) #0 {
 ; VERDE-LABEL: load_2darray_lwe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -841,7 +841,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) #0 {
 ; VERDE-LABEL: load_2dmsaa:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm
@@ -877,7 +877,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) #0 {
 ; VERDE-LABEL: load_2dmsaa_both:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -969,7 +969,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) #0 {
 ; VERDE-LABEL: load_2darraymsaa:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da
@@ -1005,7 +1005,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) #0 {
 ; VERDE-LABEL: load_2darraymsaa_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -1101,7 +1101,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm
@@ -1137,7 +1137,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_1d_lwe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -1225,7 +1225,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm
@@ -1261,7 +1261,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2d_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v5, v0
@@ -1353,7 +1353,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_V2_tfe_dmask0:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v1, 0
@@ -1405,7 +1405,7 @@
   ret float %vv
 }
 
-define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_V1_tfe_dmask0:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v1, 0
@@ -1457,7 +1457,7 @@
   ret float %vv
 }
 
-define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2d_tfe_dmask0:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v3, 0
@@ -1509,7 +1509,7 @@
   ret float %vv
 }
 
-define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2d_tfe_nouse:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v3, 0
@@ -1561,7 +1561,7 @@
   ret float %vv
 }
 
-define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2d_tfe_nouse_V2:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v3, 0
@@ -1613,7 +1613,7 @@
   ret float %vv
 }
 
-define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2d_tfe_nouse_V1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v3, 0
@@ -1665,7 +1665,7 @@
   ret float %vv
 }
 
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_tfe_V4_dmask3:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v4, v0
@@ -1745,7 +1745,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_tfe_V4_dmask2:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v3, v0
@@ -1821,7 +1821,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_tfe_V4_dmask1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v2, v0
@@ -1893,7 +1893,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
+define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_tfe_V2_dmask1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v2, v0
@@ -1966,7 +1966,7 @@
 }
 
 
-define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_3d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
@@ -2002,7 +2002,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_cube:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da
@@ -2038,7 +2038,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_1darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm da
@@ -2074,7 +2074,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) #0 {
 ; VERDE-LABEL: load_mip_2darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da
@@ -2110,7 +2110,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) #0 {
 ; VERDE-LABEL: store_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
@@ -2141,7 +2141,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) #0 {
 ; VERDE-LABEL: store_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
@@ -2172,7 +2172,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) {
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) #0 {
 ; VERDE-LABEL: store_3d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm
@@ -2203,7 +2203,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) #0 {
 ; VERDE-LABEL: store_cube:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
@@ -2234,7 +2234,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) {
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) #0 {
 ; VERDE-LABEL: store_1darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da
@@ -2265,7 +2265,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) #0 {
 ; VERDE-LABEL: store_2darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
@@ -2296,7 +2296,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) {
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) #0 {
 ; VERDE-LABEL: store_2dmsaa:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm
@@ -2327,7 +2327,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) #0 {
 ; VERDE-LABEL: store_2darraymsaa:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
@@ -2358,7 +2358,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) {
+define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) #0 {
 ; VERDE-LABEL: store_mip_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm
@@ -2389,7 +2389,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) {
+define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) #0 {
 ; VERDE-LABEL: store_mip_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm
@@ -2420,7 +2420,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) {
+define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) #0 {
 ; VERDE-LABEL: store_mip_3d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
@@ -2451,7 +2451,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) #0 {
 ; VERDE-LABEL: store_mip_cube:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
@@ -2482,7 +2482,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) {
+define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) #0 {
 ; VERDE-LABEL: store_mip_1darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
@@ -2513,7 +2513,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) #0 {
 ; VERDE-LABEL: store_mip_2darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
@@ -2544,7 +2544,7 @@
   ret void
 }
 
-define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm
@@ -2580,7 +2580,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm
@@ -2616,7 +2616,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_3d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm
@@ -2652,7 +2652,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_cube:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da
@@ -2688,7 +2688,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_1darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da
@@ -2724,7 +2724,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_2darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da
@@ -2760,7 +2760,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_2dmsaa:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm
@@ -2796,7 +2796,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) {
+define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) #0 {
 ; VERDE-LABEL: getresinfo_2darraymsaa:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da
@@ -2832,7 +2832,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_V1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 unorm
@@ -2868,7 +2868,7 @@
   ret float %v
 }
 
-define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_V2:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 unorm
@@ -2904,7 +2904,7 @@
   ret <2 x float> %v
 }
 
-define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) {
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) #0 {
 ; VERDE-LABEL: store_1d_V1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 unorm
@@ -2935,7 +2935,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) {
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) #0 {
 ; VERDE-LABEL: store_1d_V2:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc unorm
@@ -2966,7 +2966,7 @@
   ret void
 }
 
-define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_glc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc
@@ -3002,7 +3002,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_slc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc
@@ -3038,7 +3038,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
+define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; VERDE-LABEL: load_1d_glc_slc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc
@@ -3074,7 +3074,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) #0 {
 ; VERDE-LABEL: store_1d_glc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc
@@ -3105,7 +3105,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) #0 {
 ; VERDE-LABEL: store_1d_slc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc
@@ -3136,7 +3136,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) #0 {
 ; VERDE-LABEL: store_1d_glc_slc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
+define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) #0 {
 ; GFX9-LABEL: gather4_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -30,7 +30,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
+define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) #0 {
 ; GFX9-LABEL: gather4_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -58,7 +58,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
+define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) #0 {
 ; GFX9-LABEL: gather4_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -86,7 +86,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) #0 {
 ; GFX9-LABEL: gather4_c_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -114,7 +114,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: gather4_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -142,7 +142,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: gather4_c_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -172,7 +172,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
+define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) #0 {
 ; GFX9-LABEL: gather4_b_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -200,7 +200,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
+define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) #0 {
 ; GFX9-LABEL: gather4_c_b_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -228,7 +228,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: gather4_b_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -258,7 +258,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: gather4_c_b_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -289,7 +289,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) {
+define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) #0 {
 ; GFX9-LABEL: gather4_l_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -311,7 +311,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
+define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) #0 {
 ; GFX9-LABEL: gather4_c_l_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v3
@@ -335,7 +335,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
+define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) #0 {
 ; GFX9-LABEL: gather4_lz_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -357,7 +357,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) #0 {
 ; GFX9-LABEL: gather4_c_lz_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
+define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) #0 {
 ; GFX9-LABEL: sample_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -26,7 +26,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -54,7 +54,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) {
+define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) #0 {
 ; GFX9-LABEL: sample_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -82,7 +82,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
+define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) #0 {
 ; GFX9-LABEL: sample_cube:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -110,7 +110,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) {
+define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) #0 {
 ; GFX9-LABEL: sample_1darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -138,7 +138,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
+define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) #0 {
 ; GFX9-LABEL: sample_2darray:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -166,7 +166,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) {
+define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) #0 {
 ; GFX9-LABEL: sample_c_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -190,7 +190,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_c_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -218,7 +218,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -246,7 +246,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -274,7 +274,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -302,7 +302,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -332,7 +332,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
+define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) #0 {
 ; GFX9-LABEL: sample_b_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -356,7 +356,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_b_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -384,7 +384,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
+define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) #0 {
 ; GFX9-LABEL: sample_c_b_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -408,7 +408,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_c_b_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -436,7 +436,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_b_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -464,7 +464,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_b_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -494,7 +494,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_b_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -522,7 +522,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_b_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    s_mov_b64 s[12:13], exec
@@ -553,7 +553,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) #0 {
 ; GFX9-LABEL: sample_d_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
@@ -571,7 +571,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_d_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
@@ -603,7 +603,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) {
+define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) #0 {
 ; GFX9-LABEL: sample_d_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v8
@@ -638,7 +638,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
+define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) #0 {
 ; GFX9-LABEL: sample_c_d_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
@@ -656,7 +656,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_c_d_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
@@ -690,7 +690,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_d_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -712,7 +712,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_d_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffff
@@ -744,7 +744,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_d_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
@@ -766,7 +766,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_d_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v11, v7
@@ -800,7 +800,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
+define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) #0 {
 ; GFX9-LABEL: sample_cd_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
@@ -818,7 +818,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_cd_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
@@ -850,7 +850,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
+define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) #0 {
 ; GFX9-LABEL: sample_c_cd_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
@@ -868,7 +868,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_c_cd_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
@@ -902,7 +902,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_cd_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -924,7 +924,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_cd_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffff
@@ -956,7 +956,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_cd_cl_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
@@ -978,7 +978,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) #0 {
 ; GFX9-LABEL: sample_c_cd_cl_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v11, v7
@@ -1012,7 +1012,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) {
+define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) #0 {
 ; GFX9-LABEL: sample_l_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1034,7 +1034,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) {
+define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) #0 {
 ; GFX9-LABEL: sample_l_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1056,7 +1056,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) {
+define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) #0 {
 ; GFX9-LABEL: sample_c_l_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -1078,7 +1078,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
+define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) #0 {
 ; GFX9-LABEL: sample_c_l_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v3
@@ -1102,7 +1102,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
+define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) #0 {
 ; GFX9-LABEL: sample_lz_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16
@@ -1120,7 +1120,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_lz_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1142,7 +1142,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) {
+define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) #0 {
 ; GFX9-LABEL: sample_c_lz_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
@@ -1160,7 +1160,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) #0 {
 ; GFX9-LABEL: sample_c_lz_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -1182,7 +1182,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) {
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) #0 {
 ; GFX9-LABEL: sample_c_d_o_2darray_V1:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v8
@@ -1217,7 +1217,7 @@
   ret float %v
 }
 
-define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) {
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) #0 {
 ; GFX9-LABEL: sample_c_d_o_2darray_V2:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 {
 ; TONGA-LABEL: image_sample_2d_f16:
 ; TONGA:       ; %bb.0: ; %main_body
 ; TONGA-NEXT:    s_mov_b64 s[12:13], exec
@@ -47,7 +47,7 @@
   ret half %tex
 }
 
-define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
+define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) #0 {
 ; TONGA-LABEL: image_sample_2d_f16_tfe:
 ; TONGA:       ; %bb.0: ; %main_body
 ; TONGA-NEXT:    s_mov_b64 s[14:15], exec
@@ -123,7 +123,7 @@
   ret half %tex.vec
 }
 
-define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) #0 {
 ; TONGA-LABEL: image_sample_c_d_1d_v2f16:
 ; TONGA:       ; %bb.0: ; %main_body
 ; TONGA-NEXT:    image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
@@ -156,7 +156,7 @@
   ret float %r
 }
 
-define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) #0 {
 ; TONGA-LABEL: image_sample_c_d_1d_v2f16_tfe:
 ; TONGA:       ; %bb.0: ; %main_body
 ; TONGA-NEXT:    v_mov_b32_e32 v4, 0
@@ -212,7 +212,7 @@
   ret <2 x float> %r
 }
 
-define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) #0 {
 ; TONGA-LABEL: image_sample_b_2d_v4f16:
 ; TONGA:       ; %bb.0: ; %main_body
 ; TONGA-NEXT:    s_mov_b64 s[12:13], exec
@@ -260,7 +260,7 @@
   ret <2 x float> %r
 }
 
-define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) #0 {
 ; TONGA-LABEL: image_sample_b_2d_v4f16_tfe:
 ; TONGA:       ; %bb.0: ; %main_body
 ; TONGA-NEXT:    s_mov_b64 s[12:13], exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -36,7 +36,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[16:17], exec
@@ -102,7 +102,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -150,7 +150,7 @@
   ret <2 x float> %res
 }
 
-define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_2:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -198,7 +198,7 @@
   ret <2 x float> %res
 }
 
-define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_3:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -246,7 +246,7 @@
   ret <2 x float> %res
 }
 
-define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_4:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -294,7 +294,7 @@
   ret <2 x float> %res
 }
 
-define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_12:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -347,7 +347,7 @@
   ret <4 x float> %res
 }
 
-define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_24:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -400,7 +400,7 @@
   ret <4 x float> %res
 }
 
-define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_tfe_adjust_writemask_134:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -458,7 +458,7 @@
   ret <4 x float> %res
 }
 
-define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) #0 {
 ; VERDE-LABEL: sample_1d_lwe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[16:17], exec
@@ -524,7 +524,7 @@
   ret <4 x float> %v.vec
 }
 
-define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -557,7 +557,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) {
+define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) #0 {
 ; VERDE-LABEL: sample_3d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -590,7 +590,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
+define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) #0 {
 ; VERDE-LABEL: sample_cube:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -623,7 +623,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) {
+define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) #0 {
 ; VERDE-LABEL: sample_1darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -656,7 +656,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
+define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) #0 {
 ; VERDE-LABEL: sample_2darray:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -689,7 +689,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) #0 {
 ; VERDE-LABEL: sample_c_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -722,7 +722,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_c_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -755,7 +755,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -788,7 +788,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -821,7 +821,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -854,7 +854,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -887,7 +887,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s) {
+define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s) #0 {
 ; VERDE-LABEL: sample_b_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -920,7 +920,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_b_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -953,7 +953,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s) {
+define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s) #0 {
 ; VERDE-LABEL: sample_c_b_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -986,7 +986,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_c_b_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1019,7 +1019,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_b_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1052,7 +1052,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_b_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1085,7 +1085,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_b_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1118,7 +1118,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_b_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1151,7 +1151,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) #0 {
 ; VERDE-LABEL: sample_d_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
@@ -1175,7 +1175,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_d_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1199,7 +1199,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) #0 {
 ; VERDE-LABEL: sample_c_d_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
@@ -1223,7 +1223,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_c_d_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1247,7 +1247,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_d_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
@@ -1271,7 +1271,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_d_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1295,7 +1295,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_d_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1319,7 +1319,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_d_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1343,7 +1343,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
+define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) #0 {
 ; VERDE-LABEL: sample_cd_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
@@ -1367,7 +1367,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_cd_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1391,7 +1391,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) #0 {
 ; VERDE-LABEL: sample_c_cd_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
@@ -1415,7 +1415,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_c_cd_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1439,7 +1439,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_cd_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
@@ -1463,7 +1463,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_cd_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1487,7 +1487,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_cd_cl_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1511,7 +1511,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) #0 {
 ; VERDE-LABEL: sample_c_cd_cl_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf
@@ -1535,7 +1535,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) {
+define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) #0 {
 ; VERDE-LABEL: sample_l_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
@@ -1559,7 +1559,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
+define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) #0 {
 ; VERDE-LABEL: sample_l_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
@@ -1583,7 +1583,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) {
+define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) #0 {
 ; VERDE-LABEL: sample_c_l_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
@@ -1607,7 +1607,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
+define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) #0 {
 ; VERDE-LABEL: sample_c_l_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
@@ -1631,7 +1631,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_lz_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf
@@ -1655,7 +1655,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_lz_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
@@ -1679,7 +1679,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) #0 {
 ; VERDE-LABEL: sample_c_lz_1d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
@@ -1703,7 +1703,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) #0 {
 ; VERDE-LABEL: sample_c_lz_2d:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
@@ -1727,7 +1727,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) #0 {
 ; VERDE-LABEL: sample_c_d_o_2darray_V1:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da
@@ -1751,7 +1751,7 @@
   ret float %v
 }
 
-define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) {
+define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) #0 {
 ; VERDE-LABEL: sample_c_d_o_2darray_V1_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v9, 0
@@ -1800,7 +1800,7 @@
   ret float %v.vec
 }
 
-define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) #0 {
 ; VERDE-LABEL: sample_c_d_o_2darray_V2:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da
@@ -1824,7 +1824,7 @@
   ret <2 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) #0 {
 ; VERDE-LABEL: sample_c_d_o_2darray_V2_tfe:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    v_mov_b32_e32 v9, 0
@@ -1874,7 +1874,7 @@
   ret <4 x float> %res.2
 }
 
-define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_unorm:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1907,7 +1907,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_1d_glc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_glc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_glc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1940,7 +1940,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_1d_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_slc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -1973,7 +1973,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps <4 x float> @sample_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @sample_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: sample_1d_glc_slc:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2006,7 +2006,7 @@
   ret <4 x float> %v
 }
 
-define amdgpu_ps float @adjust_writemask_sample_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps float @adjust_writemask_sample_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_0:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2040,7 +2040,7 @@
   ret float %elt0
 }
 
-define amdgpu_ps <2 x float> @adjust_writemask_sample_01(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @adjust_writemask_sample_01(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_01:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2074,7 +2074,7 @@
   ret <2 x float> %out
 }
 
-define amdgpu_ps <3 x float> @adjust_writemask_sample_012(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <3 x float> @adjust_writemask_sample_012(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_012:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2108,7 +2108,7 @@
   ret <3 x float> %out
 }
 
-define amdgpu_ps <2 x float> @adjust_writemask_sample_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @adjust_writemask_sample_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_12:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2142,7 +2142,7 @@
   ret <2 x float> %out
 }
 
-define amdgpu_ps <2 x float> @adjust_writemask_sample_03(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @adjust_writemask_sample_03(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_03:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2176,7 +2176,7 @@
   ret <2 x float> %out
 }
 
-define amdgpu_ps <2 x float> @adjust_writemask_sample_13(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @adjust_writemask_sample_13(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_13:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2210,7 +2210,7 @@
   ret <2 x float> %out
 }
 
-define amdgpu_ps <3 x float> @adjust_writemask_sample_123(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <3 x float> @adjust_writemask_sample_123(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_123:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2244,7 +2244,7 @@
   ret <3 x float> %out
 }
 
-define amdgpu_ps <4 x float> @adjust_writemask_sample_none_enabled(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <4 x float> @adjust_writemask_sample_none_enabled(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_none_enabled:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    ; return to shader part epilog
@@ -2262,7 +2262,7 @@
   ret <4 x float> %r
 }
 
-define amdgpu_ps <2 x float> @adjust_writemask_sample_123_to_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @adjust_writemask_sample_123_to_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_123_to_12:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
@@ -2296,7 +2296,7 @@
   ret <2 x float> %out
 }
 
-define amdgpu_ps <2 x float> @adjust_writemask_sample_013_to_13(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+define amdgpu_ps <2 x float> @adjust_writemask_sample_013_to_13(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) #0 {
 ; VERDE-LABEL: adjust_writemask_sample_013_to_13:
 ; VERDE:       ; %bb.0: ; %main_body
 ; VERDE-NEXT:    s_mov_b64 s[12:13], exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_f16_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x1 unorm a16 d16
@@ -20,7 +20,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v2f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_v2f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v2f16_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x3 unorm a16 d16
@@ -38,7 +38,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v3f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_v3f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v3f16_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x7 unorm a16 d16
@@ -56,7 +56,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v4f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_v4f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v4f16_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0xf unorm a16 d16
@@ -74,7 +74,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_f16_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x1 unorm a16 d16
@@ -93,7 +93,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v2f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_v2f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v2f16_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x3 unorm a16 d16
@@ -112,7 +112,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v3f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_v3f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v3f16_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x7 unorm a16 d16
@@ -131,7 +131,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v4f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+define amdgpu_ps void @store_v4f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v4f16_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0xf unorm a16 d16
@@ -150,7 +150,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+define amdgpu_ps void @store_f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_f16_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
@@ -170,7 +170,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v2f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+define amdgpu_ps void @store_v2f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v2f16_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
@@ -190,7 +190,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v3f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+define amdgpu_ps void @store_v3f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v3f16_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
@@ -210,7 +210,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v4f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+define amdgpu_ps void @store_v4f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) #0 {
 ; GFX9-LABEL: store_v4f16_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm a16 d16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
 
-define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_f32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
@@ -19,7 +19,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v2f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_v2f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v2f32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x3 unorm a16
@@ -36,7 +36,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v3f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_v3f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v3f32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x7 unorm a16
@@ -53,7 +53,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v4f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_v4f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v4f32_1d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0xf unorm a16
@@ -70,7 +70,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_f32_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
@@ -88,7 +88,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v2f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_v2f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v2f32_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x3 unorm a16
@@ -106,7 +106,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v3f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_v3f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v3f32_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x7 unorm a16
@@ -124,7 +124,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v4f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+define amdgpu_ps void @store_v4f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v4f32_2d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0xf unorm a16
@@ -142,7 +142,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+define amdgpu_ps void @store_f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_f32_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x1 unorm a16
@@ -161,7 +161,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v2f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+define amdgpu_ps void @store_v2f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v2f32_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x3 unorm a16
@@ -180,7 +180,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v3f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+define amdgpu_ps void @store_v3f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v3f32_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x7 unorm a16
@@ -199,7 +199,7 @@
   ret void
 }
 
-define amdgpu_ps void @store_v4f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+define amdgpu_ps void @store_v4f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) #0 {
 ; GFX9-LABEL: store_v4f32_3d:
 ; GFX9:       ; %bb.0: ; %main_body
 ; GFX9-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm a16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a) {
+define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a) #0 {
 ; GFX6-LABEL: cos_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -62,7 +62,7 @@
   ret void
 }
 
-define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) {
+define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 {
 ; GFX6-LABEL: cos_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -142,3 +142,4 @@
 
 declare half @llvm.cos.f16(half %a)
 declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
+attributes #0 = { nounwind }
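
Each of these test diffs applies the same two-part change: every function definition gains a #0 attribute-group reference, and a single "attributes #0 = { nounwind }" group definition is appended after the trailing declarations, as in llvm.cos.f16.ll above. A minimal sketch of the resulting IR shape; the function name and body are illustrative only, and the rationale in the comment is an assumption about the patch's intent rather than something the diff states:

    define amdgpu_kernel void @example(half addrspace(1)* %r) #0 {
      ; nounwind asserts this function never unwinds, presumably so that
      ; unwind-related CFI directives stay out of the autogenerated checks
      store half 0.0, half addrspace(1)* %r
      ret void
    }
    attributes #0 = { nounwind }

The same attribute group is reused across every function in a file, so each file needs only the one trailing definition.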
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -21,6 +21,8 @@
 
 ; GCN-LABEL: {{^}}only_undef_dbg_value:
 ; NOOPT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef] undef
+; NOOPT-NEXT: .cfi_escape
+; NOOPT-NEXT: .cfi_undefined
 ; NOOPT-NEXT: s_endpgm
 
 ; OPT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -88,7 +88,7 @@
 ; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    half addrspace(1)* %b) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %b.val = load volatile half, half addrspace(1)* %b
@@ -157,7 +157,7 @@
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
-    half addrspace(1)* %b) {
+    half addrspace(1)* %b) #0 {
 entry:
   %b.val = load half, half addrspace(1)* %b
   %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
@@ -225,7 +225,7 @@
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
-    half addrspace(1)* %a) {
+    half addrspace(1)* %a) #0 {
 entry:
   %a.val = load half, half addrspace(1)* %a
   %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
@@ -308,7 +308,7 @@
 ; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    <2 x half> addrspace(1)* %b) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
@@ -376,7 +376,7 @@
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %b) {
+    <2 x half> addrspace(1)* %b) #0 {
 entry:
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
   %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
@@ -443,7 +443,7 @@
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a) {
+    <2 x half> addrspace(1)* %a) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
@@ -542,7 +542,7 @@
 ; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
-    <3 x half> addrspace(1)* %b) {
+    <3 x half> addrspace(1)* %b) #0 {
 entry:
   %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
   %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
@@ -655,7 +655,7 @@
 ; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
-    <4 x half> addrspace(1)* %b) {
+    <4 x half> addrspace(1)* %b) #0 {
 entry:
   %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
   %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
@@ -746,10 +746,11 @@
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
-    <4 x half> addrspace(1)* %b) {
+    <4 x half> addrspace(1)* %b) #0 {
 entry:
   %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
   %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
   store <4 x half> %r.val, <4 x half> addrspace(1)* %r
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -88,7 +88,7 @@
 ; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    half addrspace(1)* %b) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %b.val = load volatile half, half addrspace(1)* %b
@@ -97,7 +97,7 @@
   ret void
 }
 
-define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) {
+define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
 ; SI-LABEL: minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -180,7 +180,7 @@
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
-    half addrspace(1)* %b) {
+    half addrspace(1)* %b) #0 {
 entry:
   %b.val = load half, half addrspace(1)* %b
   %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
@@ -248,7 +248,7 @@
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
-    half addrspace(1)* %a) {
+    half addrspace(1)* %a) #0 {
 entry:
   %a.val = load half, half addrspace(1)* %a
   %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
@@ -331,7 +331,7 @@
 ; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    <2 x half> addrspace(1)* %b) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
@@ -340,7 +340,7 @@
   ret void
 }
 
-define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) {
+define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
 ; SI-LABEL: minnum_v2f16_no_ieee:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -429,7 +429,7 @@
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %b) {
+    <2 x half> addrspace(1)* %b) #0 {
 entry:
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
   %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
@@ -496,7 +496,7 @@
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a) {
+    <2 x half> addrspace(1)* %a) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
@@ -595,7 +595,7 @@
 ; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
-    <3 x half> addrspace(1)* %b) {
+    <3 x half> addrspace(1)* %b) #0 {
 entry:
   %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
   %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
@@ -708,7 +708,7 @@
 ; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
-    <4 x half> addrspace(1)* %b) {
+    <4 x half> addrspace(1)* %b) #0 {
 entry:
   %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
   %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
@@ -799,10 +799,11 @@
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
-    <4 x half> addrspace(1)* %b) {
+    <4 x half> addrspace(1)* %b) #0 {
 entry:
   %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
   %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
   store <4 x half> %r.val, <4 x half> addrspace(1)* %r
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) {
+define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) #0 {
 ; GFX6-LABEL: sin_f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -62,7 +62,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) {
+define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 {
 ; GFX6-LABEL: sin_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -142,3 +142,4 @@
 
 declare half @llvm.sin.f16(half %a)
 declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
--- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-- -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
+define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) #0 {
 ; GCN-LABEL: zext_shl64_to_32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -21,7 +21,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
+define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) #0 {
 ; GCN-LABEL: sext_shl64_to_32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -42,7 +42,7 @@
   ret void
 }
 
-define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
+define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) #0 {
 ; GCN-LABEL: zext_shl64_overflow:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -64,7 +64,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
+define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) #0 {
 ; GCN-LABEL: sext_shl64_overflow:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -86,7 +86,7 @@
   ret void
 }
 
-define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) #0 {
 ; GCN-LABEL: mulu24_shl64:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -109,7 +109,7 @@
   ret void
 }
 
-define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
+define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) #0 {
 ; GCN-LABEL: muli24_shl64:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -144,3 +144,4 @@
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
+define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) #0 {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -51,7 +51,7 @@
   ret void
 }
 
-define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
+define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) #0 {
 ; GCN-LABEL: scalar_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
@@ -108,7 +108,7 @@
   ret void
 }
 
-define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %arg, <4 x i32> addrspace(5)* noalias nocapture %arg1) {
+define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %arg, <4 x i32> addrspace(5)* noalias nocapture %arg1) #0 {
 ; GCN-LABEL: mubuf_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -185,7 +185,7 @@
   ret void
 }
 
-define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture readnone %arg1, <4 x i32> addrspace(1)* noalias nocapture %arg2) {
+define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture readnone %arg1, <4 x i32> addrspace(1)* noalias nocapture %arg2) #0 {
 ; GCN-LABEL: vector_clause_indirect:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -225,7 +225,7 @@
   ret void
 }
 
-define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrspace(1)* %out) {
+define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrspace(1)* %out) #0 {
 ; GCN-LABEL: load_global_d16_hi:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -258,7 +258,7 @@
   ret void
 }
 
-define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrspace(1)* %out) {
+define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrspace(1)* %out) #0 {
 ; GCN-LABEL: load_global_d16_lo:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -292,3 +292,4 @@
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -5,7 +5,7 @@
 
 ; Ensure two if.break calls, for both the inner and outer loops
 ; FIXME: duplicate comparison
-define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
+define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) #0 {
 ; OPT-LABEL: @multi_else_break(
 ; OPT-NEXT:  main_body:
 ; OPT-NEXT:    br label [[LOOP_OUTER:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll b/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll
--- a/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/noop-shader-O0.ll
@@ -9,7 +9,7 @@
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 target triple = "amdgcn-amd-amdpal"
 
-define amdgpu_vs void @noop_vs() {
+define amdgpu_vs void @noop_vs() #0 {
 ; GCN-LABEL: noop_vs:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
@@ -17,7 +17,7 @@
   ret void
 }
 
-define amdgpu_ls void @noop_ls() {
+define amdgpu_ls void @noop_ls() #0 {
 ; GCN-LABEL: noop_ls:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
@@ -25,7 +25,7 @@
   ret void
 }
 
-define amdgpu_hs void @noop_hs() {
+define amdgpu_hs void @noop_hs() #0 {
 ; GCN-LABEL: noop_hs:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
@@ -33,7 +33,7 @@
   ret void
 }
 
-define amdgpu_es void @noop_es() {
+define amdgpu_es void @noop_es() #0 {
 ; GCN-LABEL: noop_es:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
@@ -41,7 +41,7 @@
   ret void
 }
 
-define amdgpu_gs void @noop_gs() {
+define amdgpu_gs void @noop_gs() #0 {
 ; GCN-LABEL: noop_gs:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
@@ -49,7 +49,7 @@
   ret void
 }
 
-define amdgpu_ps void @noop_ps() {
+define amdgpu_ps void @noop_ps() #0 {
 ; GCN-LABEL: noop_ps:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
@@ -57,10 +57,11 @@
   ret void
 }
 
-define amdgpu_cs void @noop_cs() {
+define amdgpu_cs void @noop_cs() #0 {
 ; GCN-LABEL: noop_cs:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_endpgm
 entry:
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -5,7 +5,7 @@
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
 
-define i8 @flat_inst_valu_offset_1(i8* %p) {
+define i8 @flat_inst_valu_offset_1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -28,7 +28,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51,7 +51,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -74,7 +74,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,7 +99,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -124,7 +124,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -149,7 +149,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -174,7 +174,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -197,7 +197,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -222,7 +222,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -247,7 +247,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -272,7 +272,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -297,7 +297,7 @@
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -323,7 +323,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -349,7 +349,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,7 +375,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -401,7 +401,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -427,7 +427,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -453,7 +453,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -479,7 +479,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -506,7 +506,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -533,7 +533,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -560,7 +560,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -587,7 +587,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -614,7 +614,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -640,7 +640,7 @@
   ret i8 %load
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -671,7 +671,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -702,7 +702,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -733,7 +733,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -766,7 +766,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -799,7 +799,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -832,7 +832,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -865,7 +865,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -896,7 +896,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -929,7 +929,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -962,7 +962,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -995,7 +995,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1028,7 +1028,7 @@
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1062,7 +1062,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1095,7 +1095,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1128,7 +1128,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1161,7 +1161,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1195,7 +1195,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1229,7 +1229,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1263,7 +1263,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1297,7 +1297,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1331,7 +1331,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1365,7 +1365,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1400,7 +1400,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1435,7 +1435,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) #0 {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1468,3 +1468,4 @@
   store i8 %load, i8* undef
   ret void
 }
+attributes #0 = { nounwind }
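This file and the offset-split-global.ll diff that follows exercise the same boundary arithmetic: an offset too large for a memory instruction's immediate field is split into a low part that fits the field and a high part folded into the base-address computation. A minimal sketch of that split, assuming an N-bit signed immediate field (the width 11 and the helper name splitOffset below are illustrative assumptions, not the compiler's actual code; real field widths and signedness vary by target and by flat/global/scratch instruction kind):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Split Offset into Lo + Hi so that Lo fits an N-bit signed immediate and
// Hi can be folded into the base-address add. Illustrative sketch only.
static void splitOffset(int64_t Offset, unsigned N, int64_t &Lo, int64_t &Hi) {
  uint64_t Mask = (1ull << N) - 1;
  // Sign-extend the low N bits of Offset (arithmetic right shift).
  Lo = static_cast<int64_t>((static_cast<uint64_t>(Offset) & Mask)
                            << (64 - N)) >> (64 - N);
  Hi = Offset - Lo; // Remainder carried by the base-address computation.
}

int main() {
  // One of the boundary cases from the tests: (1ull << 33) | 2047,
  // split against a hypothetical 11-bit field.
  int64_t Lo, Hi;
  splitOffset((1ll << 33) | 2047, /*N=*/11, Lo, Hi);
  assert(Lo + Hi == ((1ll << 33) | 2047));
  std::printf("lo=%lld hi=%lld\n", (long long)Lo, (long long)Hi); // lo=-1
}
```

The split0/split1 test pairs sit one on either side of such a boundary (e.g. 2047 vs. 2048 for an 11-bit field), which is why each width appears twice.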
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -5,7 +5,7 @@
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
 
-define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,7 +26,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47,7 +47,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -70,7 +70,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -95,7 +95,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -116,7 +116,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -139,7 +139,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -164,7 +164,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -187,7 +187,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -212,7 +212,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -237,7 +237,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -260,7 +260,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -285,7 +285,7 @@
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -311,7 +311,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -337,7 +337,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -363,7 +363,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -389,7 +389,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -415,7 +415,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -441,7 +441,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -467,7 +467,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,7 +494,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -521,7 +521,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -548,7 +548,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -575,7 +575,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -602,7 +602,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -628,7 +628,7 @@
   ret i8 %load
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -657,7 +657,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -686,7 +686,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -715,7 +715,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -746,7 +746,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -775,7 +775,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -804,7 +804,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -835,7 +835,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -864,7 +864,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -895,7 +895,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -926,7 +926,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -955,7 +955,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -986,7 +986,7 @@
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1018,7 +1018,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1049,7 +1049,7 @@
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1080,7 +1080,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1111,7 +1111,7 @@
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1143,7 +1143,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1175,7 +1175,7 @@
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1207,7 +1207,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1240,7 +1240,7 @@
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1273,7 +1273,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1306,7 +1306,7 @@
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1340,7 +1340,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1374,7 +1374,7 @@
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) #0 {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1406,3 +1406,4 @@
   store i8 %load, i8 addrspace(1)* undef
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/or3.ll b/llvm/test/CodeGen/AMDGPU/or3.ll
--- a/llvm/test/CodeGen/AMDGPU/or3.ll
+++ b/llvm/test/CodeGen/AMDGPU/or3.ll
@@ -7,7 +7,7 @@
 ; V_OR3_B32
 ; ===================================================================================
 
-define amdgpu_ps float @or3(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @or3(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: or3:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -32,7 +32,7 @@
 
 ; ThreeOp instruction variant not used due to Constant Bus Limitations
 ; TODO: with reassociation it is possible to replace a v_or_b32_e32 with an s_or_b32
-define amdgpu_ps float @or3_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
+define amdgpu_ps float @or3_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) #0 {
 ; VI-LABEL: or3_vgpr_a:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_or_b32_e32 v0, s2, v0
@@ -56,7 +56,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @or3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @or3_vgpr_all2(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: or3_vgpr_all2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -79,7 +79,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @or3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
+define amdgpu_ps float @or3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: or3_vgpr_bc:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_or_b32_e32 v0, s2, v0
@@ -102,7 +102,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @or3_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @or3_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: or3_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -124,3 +124,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -97,7 +97,7 @@
   ret void
 }
 
-define amdgpu_kernel void @scalar_to_vector_v4i16() {
+define amdgpu_kernel void @scalar_to_vector_v4i16() #0 {
 ; SI-LABEL: scalar_to_vector_v4i16:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -139,7 +139,7 @@
   ret void
 }
 
-define amdgpu_kernel void @scalar_to_vector_v4f16() {
+define amdgpu_kernel void @scalar_to_vector_v4f16() #0 {
 ; SI-LABEL: scalar_to_vector_v4f16:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -247,3 +247,4 @@
   store <2 x half> %bc, <2 x half> addrspace(1)* %out
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -13,7 +13,7 @@
 ; This was fixed by adding an additional pattern in R600Instructions.td to
 ; match this pattern with a CNDGE_INT.
 
-define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 ; GCN-LABEL: sdiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -213,7 +213,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 ; GCN-LABEL: sdiv_i32_4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -307,7 +307,7 @@
 ; Multiply by a weird constant to make sure setIntDivIsCheap is
 ; working.
 
-define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 ; GCN-LABEL: slow_sdiv_i32_3435:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -405,7 +405,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: sdiv_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -734,7 +734,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: sdiv_v2i32_4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -843,7 +843,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: sdiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x9
@@ -1432,7 +1432,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: sdiv_v4i32_4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1576,7 +1576,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 ; GCN-LABEL: v_sdiv_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1721,7 +1721,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) #0 {
 ; GCN-LABEL: v_sdiv_i23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1904,7 +1904,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
 ; GCN-LABEL: v_sdiv_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2091,7 +2091,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
 ; GCN-LABEL: v_sdiv_i25:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -2335,7 +2335,7 @@
 ;   ret void
 ; }
 
-define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) #0 {
 ; GCN-LABEL: scalarize_mulhs_4xi32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2482,3 +2482,4 @@
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_sdiv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -255,7 +255,7 @@
   ret void
 }
 
-define i64 @v_test_sdiv(i64 %x, i64 %y) {
+define i64 @v_test_sdiv(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_sdiv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,7 +494,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_sdiv24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -561,7 +561,7 @@
   ret void
 }
 
-define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
+define i64 @v_test_sdiv24_64(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_sdiv24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -603,7 +603,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_sdiv32_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -666,7 +666,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_sdiv31_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -733,7 +733,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_sdiv23_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -800,7 +800,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_sdiv25_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -867,7 +867,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; GCN-LABEL: s_test_sdiv24_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -968,7 +968,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
+define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) #0 {
 ; GCN-LABEL: s_test_sdiv24_48:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1130,7 +1130,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_sdiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1350,7 +1350,7 @@
   ret void
 }
 
-define i64 @v_test_sdiv_k_num_i64(i64 %x) {
+define i64 @v_test_sdiv_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_sdiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1562,7 +1562,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_sdiv_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1780,7 +1780,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_sdiv_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1881,7 +1881,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_sdiv24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1941,7 +1941,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_sdiv24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1999,7 +1999,7 @@
   ret void
 }
 
-define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
+define i64 @v_test_sdiv24_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_sdiv24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2044,7 +2044,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_sdiv24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2089,7 +2089,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_sdiv24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2122,3 +2122,4 @@
   %result = sdiv i64 %x.shr, 32768
   ret i64 %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -77,7 +77,7 @@
     half addrspace(1)* %a,
     half addrspace(1)* %b,
     half addrspace(1)* %c,
-    half addrspace(1)* %d) {
+    half addrspace(1)* %d) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %b.val = load volatile half, half addrspace(1)* %b
@@ -153,7 +153,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c,
-    half addrspace(1)* %d) {
+    half addrspace(1)* %d) #0 {
 entry:
   %b.val = load volatile half, half addrspace(1)* %b
   %c.val = load volatile half, half addrspace(1)* %c
@@ -228,7 +228,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c,
-    half addrspace(1)* %d) {
+    half addrspace(1)* %d) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %c.val = load volatile half, half addrspace(1)* %c
@@ -304,7 +304,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
-    half addrspace(1)* %d) {
+    half addrspace(1)* %d) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %b.val = load volatile half, half addrspace(1)* %b
@@ -380,7 +380,7 @@
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
-    half addrspace(1)* %c) {
+    half addrspace(1)* %c) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %b.val = load volatile half, half addrspace(1)* %b
@@ -488,7 +488,7 @@
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
     <2 x half> addrspace(1)* %c,
-    <2 x half> addrspace(1)* %d) {
+    <2 x half> addrspace(1)* %d) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
@@ -584,7 +584,7 @@
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b,
     <2 x half> addrspace(1)* %c,
-    <2 x half> addrspace(1)* %d) {
+    <2 x half> addrspace(1)* %d) #0 {
 entry:
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
   %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
@@ -679,7 +679,7 @@
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %c,
-    <2 x half> addrspace(1)* %d) {
+    <2 x half> addrspace(1)* %d) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
@@ -774,7 +774,7 @@
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
-    <2 x half> addrspace(1)* %d) {
+    <2 x half> addrspace(1)* %d) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
@@ -870,7 +870,7 @@
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
-    <2 x half> addrspace(1)* %c) {
+    <2 x half> addrspace(1)* %c) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
@@ -880,3 +880,4 @@
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
-define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) #0 {
 ; GCN-LABEL: sext_i16_to_i32_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -21,7 +21,7 @@
 }
 
 
-define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) #0 {
 ; GCN-LABEL: sext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -43,7 +43,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) #0 {
 ; GCN-LABEL: sext_i16_to_i32_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -64,7 +64,7 @@
 }
 
 
-define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) #0 {
 ; GCN-LABEL: sext_i16_to_i64_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -8,7 +8,7 @@
 ; threads will execute the same code paths, so we don't need to worry
 ; about instructions in different blocks overwriting each other.
 
-define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) #0 {
 ; SI-LABEL: sgpr_if_else_salu_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -48,7 +48,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) #0 {
 ; SI-LABEL: sgpr_if_else_salu_br_opt:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0x13
@@ -93,7 +93,7 @@
 
 ; The two S_ADD instructions should write to different registers, since
 ; different threads will take different control flow paths.
-define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) #0 {
 ; SI-LABEL: sgpr_if_else_valu_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
@@ -141,7 +141,7 @@
   ret void
 }
 
-define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) #0 {
 ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -214,4 +214,4 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-attributes #0 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
+define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) #0 {
 ; GCN-LABEL: v_shl_i128_vv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,7 +26,7 @@
   ret i128 %shl
 }
 
-define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
+define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) #0 {
 ; GCN-LABEL: v_lshr_i128_vv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +52,7 @@
   ret i128 %shl
 }
 
-define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
+define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) #0 {
 ; GCN-LABEL: v_ashr_i128_vv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,7 +79,7 @@
 }
 
 
-define i128 @v_shl_i128_vk(i128 %lhs) {
+define i128 @v_shl_i128_vk(i128 %lhs) #0 {
 ; GCN-LABEL: v_shl_i128_vk:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,7 +93,7 @@
   ret i128 %shl
 }
 
-define i128 @v_lshr_i128_vk(i128 %lhs) {
+define i128 @v_lshr_i128_vk(i128 %lhs) #0 {
 ; GCN-LABEL: v_lshr_i128_vk:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -106,7 +106,7 @@
   ret i128 %shl
 }
 
-define i128 @v_ashr_i128_vk(i128 %lhs) {
+define i128 @v_ashr_i128_vk(i128 %lhs) #0 {
 ; GCN-LABEL: v_ashr_i128_vk:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,7 +120,7 @@
   ret i128 %shl
 }
 
-define i128 @v_shl_i128_kv(i128 %rhs) {
+define i128 @v_shl_i128_kv(i128 %rhs) #0 {
 ; GCN-LABEL: v_shl_i128_kv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,7 +142,7 @@
   ret i128 %shl
 }
 
-define i128 @v_lshr_i128_kv(i128 %rhs) {
+define i128 @v_lshr_i128_kv(i128 %rhs) #0 {
 ; GCN-LABEL: v_lshr_i128_kv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -163,7 +163,7 @@
   ret i128 %shl
 }
 
-define i128 @v_ashr_i128_kv(i128 %rhs) {
+define i128 @v_ashr_i128_kv(i128 %rhs) #0 {
 ; GCN-LABEL: v_ashr_i128_kv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +181,7 @@
   ret i128 %shl
 }
 
-define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
+define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) #0 {
 ; GCN-LABEL: s_shl_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
@@ -218,7 +218,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
+define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) #0 {
 ; GCN-LABEL: s_lshr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
@@ -255,7 +255,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
+define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) #0 {
 ; GCN-LABEL: s_ashr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
@@ -294,7 +294,7 @@
   ret void
 }
 
-define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
 ; GCN-LABEL: v_shl_v2i128_vv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -343,7 +343,7 @@
   ret <2 x i128> %shl
 }
 
-define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
 ; GCN-LABEL: v_lshr_v2i128_vv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -392,7 +392,7 @@
   ret <2 x i128> %shl
 }
 
-define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
 ; GCN-LABEL: v_ashr_v2i128_vv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -443,7 +443,7 @@
   ret <2 x i128> %shl
 }
 
-define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
+define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
@@ -513,7 +513,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
+define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
@@ -583,7 +583,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
+define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
@@ -657,3 +657,4 @@
   ret void
 }
 
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -7,7 +7,7 @@
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 
-define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -53,7 +53,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -104,7 +104,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -159,7 +159,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) #0 {
 ; GCN-LABEL: shl_i16_v_s:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -214,7 +214,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) #0 {
 ; GCN-LABEL: shl_i16_v_compute_s:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -278,7 +278,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_i16_computed_amount:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -353,7 +353,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
+define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) #0 {
 ; GCN-LABEL: shl_i16_i_s:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -399,7 +399,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_v2i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -478,7 +478,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -598,7 +598,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -654,7 +654,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -726,7 +726,7 @@
   ret void
 }
 
-define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
 ; GCN-LABEL: shl_v4i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -837,7 +837,7 @@
 }
 
 ; Make sure load width gets reduced to i32 load.
-define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
+define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) #0 {
 ; GCN-LABEL: s_shl_32_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -866,7 +866,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
 ; GCN-LABEL: v_shl_32_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -914,7 +914,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) #0 {
 ; GCN-LABEL: s_shl_constant_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -961,7 +961,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #0 {
 ; GCN-LABEL: v_shl_constant_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1019,7 +1019,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #0 {
 ; GCN-LABEL: v_shl_i64_32_bit_constant:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1074,7 +1074,7 @@
   ret void
 }
 
-define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #0 {
 ; GCN-LABEL: v_shl_inline_imm_64_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1127,7 +1127,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_64_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1168,7 +1168,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_1_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1203,7 +1203,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_1_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1236,7 +1236,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_neg_1_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1269,7 +1269,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_0_5_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1302,7 +1302,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_neg_0_5_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1335,7 +1335,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_2_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1368,7 +1368,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_neg_2_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1401,7 +1401,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_4_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1434,7 +1434,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_neg_4_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1470,7 +1470,7 @@
 
 ; Test with the 64-bit integer bitpattern for a 32-bit float in the
 ; low 32-bits, which is not a valid 64-bit inline immediate.
-define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1514,7 +1514,7 @@
 }
 
 ; FIXME: Copy of -1 register
-define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1560,7 +1560,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1595,7 +1595,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) #0 {
 ; GCN-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1630,7 +1630,7 @@
   ret void
 }
 
-define amdgpu_kernel void @test_mul2(i32 %p) {
+define amdgpu_kernel void @test_mul2(i32 %p) #0 {
 ; GCN-LABEL: test_mul2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -1657,7 +1657,7 @@
    ret void
 }
 
-define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
+define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) #0 {
 ; GCN-LABEL: shl_or_k:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1689,7 +1689,7 @@
   ret void
 }
 
-define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
+define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) #0 {
 ; GCN-LABEL: shl_or_k_two_uses:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add.ll b/llvm/test/CodeGen/AMDGPU/shl_add.ll
--- a/llvm/test/CodeGen/AMDGPU/shl_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add.ll
@@ -7,7 +7,7 @@
 ; V_LSHL_ADD_U32
 ; ===================================================================================
 
-define amdgpu_ps float @shl_add(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @shl_add(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: shl_add:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -31,7 +31,7 @@
 }
 
 ; ThreeOp instruction variant not used due to Constant Bus Limitations
-define amdgpu_ps float @shl_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
+define amdgpu_ps float @shl_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) #0 {
 ; VI-LABEL: shl_add_vgpr_a:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
@@ -55,7 +55,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_add_vgpr_all(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @shl_add_vgpr_all(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: shl_add_vgpr_all:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -78,7 +78,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
+define amdgpu_ps float @shl_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) #0 {
 ; VI-LABEL: shl_add_vgpr_ab:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -101,7 +101,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_add_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @shl_add_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: shl_add_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
@@ -123,3 +123,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/shl_or.ll b/llvm/test/CodeGen/AMDGPU/shl_or.ll
--- a/llvm/test/CodeGen/AMDGPU/shl_or.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_or.ll
@@ -7,7 +7,7 @@
 ; V_LSHL_OR_B32
 ; ===================================================================================
 
-define amdgpu_ps float @shl_or(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @shl_or(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: shl_or:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -30,7 +30,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) {
+define amdgpu_ps float @shl_or_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) #0 {
 ; VI-LABEL: shl_or_vgpr_c:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_lshl_b32 s0, s2, s3
@@ -54,7 +54,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @shl_or_vgpr_all2(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: shl_or_vgpr_all2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -77,7 +77,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) {
+define amdgpu_ps float @shl_or_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) #0 {
 ; VI-LABEL: shl_or_vgpr_ac:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
@@ -100,7 +100,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @shl_or_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: shl_or_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -123,7 +123,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_const2(i32 %a, i32 %b) {
+define amdgpu_ps float @shl_or_vgpr_const2(i32 %a, i32 %b) #0 {
 ; VI-LABEL: shl_or_vgpr_const2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
@@ -146,7 +146,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_const_scalar1(i32 inreg %a, i32 %b) {
+define amdgpu_ps float @shl_or_vgpr_const_scalar1(i32 inreg %a, i32 %b) #0 {
 ; VI-LABEL: shl_or_vgpr_const_scalar1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_lshl_b32 s0, s2, 6
@@ -169,7 +169,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @shl_or_vgpr_const_scalar2(i32 %a, i32 inreg %b) {
+define amdgpu_ps float @shl_or_vgpr_const_scalar2(i32 %a, i32 inreg %b) #0 {
 ; VI-LABEL: shl_or_vgpr_const_scalar2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
@@ -191,3 +191,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s
 
-define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) #0 {
 ; SI-LABEL: break_inserted_outside_of_loop:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -64,7 +64,7 @@
   br i1 %1, label %ENDLOOP, label %ENDIF
 }
 
-define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
+define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) #0 {
 ; SI-LABEL: phi_cond_outside_loop:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
@@ -233,7 +233,7 @@
   ret void
 }
 
-define i64 @v_test_srem(i64 %x, i64 %y) {
+define i64 @v_test_srem(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_srem:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -475,7 +475,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem23_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -546,7 +546,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -617,7 +617,7 @@
   ret void
 }
 
-define i64 @v_test_srem24_64(i64 %x, i64 %y) {
+define i64 @v_test_srem24_64(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_srem24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -671,7 +671,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem25_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -742,7 +742,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem31_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -814,7 +814,7 @@
 }
 
 ; 32 known sign bits
-define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem32_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -880,7 +880,7 @@
 }
 
 ; 33 known sign bits
-define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_srem33_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -1145,7 +1145,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
+define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) #0 {
 ; GCN-LABEL: s_test_srem24_48:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1317,7 +1317,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_srem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1534,7 +1534,7 @@
   ret void
 }
 
-define i64 @v_test_srem_k_num_i64(i64 %x) {
+define i64 @v_test_srem_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_srem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1744,7 +1744,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_srem_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_srem_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1960,7 +1960,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_srem_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_srem_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2066,7 +2066,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_srem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2126,7 +2126,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_srem24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2190,7 +2190,7 @@
   ret void
 }
 
-define i64 @v_test_srem24_k_num_i64(i64 %x) {
+define i64 @v_test_srem24_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_srem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2239,7 +2239,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_srem24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2288,7 +2288,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_srem24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2325,3 +2325,4 @@
   %result = srem i64 %x.shr, 32768
   ret i64 %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -12,7 +12,7 @@
 ; speculatively refer to the ABI stack pointer register at all.
 
 ; An assert was hit when frame offset register was used to address FrameIndex.
-define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) {
+define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) #0 {
 ; GCN-LABEL: kernel_background_evaluate:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0x24
@@ -74,3 +74,4 @@
 }
 
 declare hidden i32 @svm_eval_nodes(float addrspace(5)*, <1339 x i32> addrspace(5)*, <4 x i32> addrspace(5)*, i32, i32) local_unnamed_addr
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
 
 ; Make sure high constant 0 isn't pointlessly materialized
-define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
+define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) #0 {
 ; GCN-LABEL: trunc_bitcast_i64_lshr_32_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14,7 +14,7 @@
   ret i16 %trunc
 }
 
-define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
+define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) #0 {
 ; GCN-LABEL: trunc_bitcast_i64_lshr_32_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25,7 +25,7 @@
   ret i32 %trunc
 }
 
-define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
+define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) #0 {
 ; SI-LABEL: trunc_bitcast_v2i32_to_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -54,7 +54,7 @@
 }
 
 ; Make sure there's no crash if the source vector type is FP
-define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
+define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) #0 {
 ; SI-LABEL: trunc_bitcast_v2f32_to_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -82,7 +82,7 @@
   ret i16 %add
 }
 
-define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace(1)* nocapture readonly %arg, <2 x i16> addrspace(1)* nocapture readonly %arg1, <2 x i16> addrspace(1)* nocapture %arg2) local_unnamed_addr {
+define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace(1)* nocapture readonly %arg, <2 x i16> addrspace(1)* nocapture readonly %arg1, <2 x i16> addrspace(1)* nocapture %arg2) local_unnamed_addr #0 {
 ; SI-LABEL: truncate_high_elt_extract_vector:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -156,3 +156,4 @@
   %trunc = trunc <2 x i64> %arg0 to <2 x i16>
   ret <2 x i16> %trunc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_udiv_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -223,7 +223,7 @@
   ret void
 }
 
-define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
+define i64 @v_test_udiv_i64(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_udiv_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -431,7 +431,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_udiv24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -488,7 +488,7 @@
   ret void
 }
 
-define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
+define i64 @v_test_udiv24_i64(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_udiv24_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -530,7 +530,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_udiv32_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
@@ -583,7 +583,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_udiv31_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -640,7 +640,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_udiv23_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -697,7 +697,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
+define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) #0 {
 ; GCN-LABEL: s_test_udiv24_i48:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xd
@@ -937,7 +937,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_udiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1136,12 +1136,12 @@
   ret void
 }
 
-; define i64 @v_test_udiv_k_num_i64(i64 %x) {
+; define i64 @v_test_udiv_k_num_i64(i64 %x) #0 {
 ;   %result = udiv i64 24, %x
 ;   ret i64 %result
 ; }
 
-define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_udiv_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1331,7 +1331,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_udiv_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1421,7 +1421,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_udiv_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -1613,7 +1613,7 @@
   ret void
 }
 
-define i64 @v_test_udiv_k_den_i64(i64 %x) {
+define i64 @v_test_udiv_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_udiv_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1795,7 +1795,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_udiv24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1843,7 +1843,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_udiv24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1893,7 +1893,7 @@
   ret void
 }
 
-define i64 @v_test_udiv24_k_num_i64(i64 %x) {
+define i64 @v_test_udiv24_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_udiv24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1932,7 +1932,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_udiv24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1971,7 +1971,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_udiv24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1998,3 +1998,4 @@
   %result = udiv i64 %x.shr, 32768
   ret i64 %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_urem_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
@@ -233,7 +233,7 @@
   ret void
 }
 
-define i64 @v_test_urem_i64(i64 %x, i64 %y) {
+define i64 @v_test_urem_i64(i64 %x, i64 %y) #0 {
 ; GCN-LABEL: v_test_urem_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -446,7 +446,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_urem31_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
@@ -505,7 +505,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; GCN-LABEL: s_test_urem31_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -596,7 +596,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 {
 ; GCN-LABEL: s_test_urem24_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
@@ -655,7 +655,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) #0 {
 ; GCN-LABEL: s_test_urem23_64_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -746,7 +746,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_urem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -953,7 +953,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_urem_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -1153,12 +1153,12 @@
 }
 
 ; FIXME: Constant bus violation
-; define i64 @v_test_urem_k_num_i64(i64 %x) {
+; define i64 @v_test_urem_k_num_i64(i64 %x) #0 {
 ;   %result = urem i64 24, %x
 ;   ret i64 %result
 ; }
 
-define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_urem_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_urem_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1353,7 +1353,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_urem_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_urem_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1444,7 +1444,7 @@
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_urem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1496,7 +1496,7 @@
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) #0 {
 ; GCN-LABEL: s_test_urem24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1552,7 +1552,7 @@
   ret void
 }
 
-define i64 @v_test_urem24_k_num_i64(i64 %x) {
+define i64 @v_test_urem24_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_urem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1595,7 +1595,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
+define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_urem24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1638,7 +1638,7 @@
   ret i64 %result
 }
 
-define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) {
+define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) #0 {
 ; GCN-LABEL: v_test_urem24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1667,3 +1667,4 @@
   %result = urem i64 %x.shr, 32768
   ret i64 %result
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -52,7 +52,7 @@
 ; VI-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    half addrspace(1)* %b) #0 {
 entry:
   %a.val = load half, half addrspace(1)* %a
   %b.val = load half, half addrspace(1)* %b
@@ -136,7 +136,7 @@
     half addrspace(1)* %r1,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
-    half addrspace(1)* %c) {
+    half addrspace(1)* %c) #0 {
 entry:
   %a.val = load volatile half, half addrspace(1)* %a
   %b.val = load volatile half, half addrspace(1)* %b
@@ -151,3 +151,4 @@
   store half %r1.val, half addrspace(1)* %r1
   ret void
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
 
-define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_23uu:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14,7 +14,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_234u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30,7 +30,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_u1u3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43,7 +43,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_u3u1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -57,7 +57,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_u3uu:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -70,7 +70,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_3u6u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -86,7 +86,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_3uu7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -102,7 +102,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_35u5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -122,7 +122,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_357u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,7 +142,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_0101:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,7 +156,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -169,7 +169,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_0145:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -183,7 +183,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -197,7 +197,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_2301:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -211,7 +211,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_2323:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -225,7 +225,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_2345:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -239,7 +239,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_2367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -253,7 +253,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_4501:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -269,7 +269,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_4523:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -285,7 +285,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_4545:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -299,7 +299,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_4567:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -312,7 +312,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_6701:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -328,7 +328,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_6723:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -344,7 +344,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_6745:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -358,7 +358,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_6767:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -372,7 +372,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_2356:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -390,7 +390,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_5623:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -408,7 +408,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -427,7 +427,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -447,7 +447,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_5734:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -468,7 +468,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
+define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4i16_2356:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -486,7 +486,7 @@
   ret <4 x i16> %shuffle
 }
 
-define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
+define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4i16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,7 +500,7 @@
   ret <4 x i16> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_0000:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -516,7 +516,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_1010:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -533,7 +533,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_1100:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -552,7 +552,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_6161:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -571,7 +571,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_2333:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -587,7 +587,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_6667:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -603,7 +603,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v8f16_0101:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -617,7 +617,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v8f16_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -630,7 +630,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v8f16_4589:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -644,7 +644,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -660,7 +660,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -678,7 +678,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v3f16_0122:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -693,7 +693,7 @@
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v2f16_0122:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -709,7 +709,7 @@
   ret <4 x half> %shuffle
 }
 
-define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
+define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v6f16_452367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -728,7 +728,7 @@
   ret <6 x half> %shuffle
 }
 
-define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
+define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) #0 {
 ; GFX9-LABEL: fma_shuffle:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -783,7 +783,7 @@
   ret void
 }
 
-define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) #0 {
 ; GFX9-LABEL: shuffle_v4f16_0456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -803,7 +803,7 @@
   ret <4 x half> %shuffle
 }
 
-define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
+define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) #0 {
 ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
-define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i16_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -39,7 +39,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i16_constant_load_zext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -79,7 +79,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i16_constant_load_sext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -119,7 +119,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i17_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -166,7 +166,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_f16_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -201,7 +201,7 @@
 }
 
 ; FIXME: valu usage on VI
-define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_v2i8_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -250,7 +250,7 @@
   ret void
 }
 
-define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) #0 {
 ; SI-LABEL: no_widen_i16_constant_divergent_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -295,7 +295,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i1_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -329,7 +329,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i16_zextload_i64_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -369,7 +369,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) #0 {
 ; SI-LABEL: widen_i1_zext_to_i64_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -410,7 +410,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
+define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) #0 {
 ; SI-LABEL: widen_i16_constant32_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -449,7 +449,7 @@
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
+define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) #0 {
 ; SI-LABEL: widen_i16_global_invariant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -489,3 +489,4 @@
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 !0 = !{}
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll
--- a/llvm/test/CodeGen/AMDGPU/xor3.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3.ll
@@ -6,7 +6,7 @@
 ; V_XOR3_B32
 ; ===================================================================================
 
-define amdgpu_ps float @xor3(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @xor3(i32 %a, i32 %b, i32 %c) #0 {
 ; GFX9-LABEL: xor3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -24,7 +24,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
+define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) #0 {
 ; GFX9-LABEL: xor3_vgpr_b:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
@@ -42,7 +42,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @xor3_vgpr_all2(i32 %a, i32 %b, i32 %c) #0 {
 ; GFX9-LABEL: xor3_vgpr_all2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
@@ -60,7 +60,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
+define amdgpu_ps float @xor3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) #0 {
 ; GFX9-LABEL: xor3_vgpr_bc:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
@@ -78,7 +78,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor3_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @xor3_vgpr_const(i32 %a, i32 %b) #0 {
 ; GFX9-LABEL: xor3_vgpr_const:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -96,7 +96,7 @@
   ret float %bc
 }
 
-define amdgpu_ps <2 x float> @xor3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) {
+define amdgpu_ps <2 x float> @xor3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) #0 {
 ; GFX9-LABEL: xor3_multiuse_outer:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -119,7 +119,7 @@
   ret <2 x float> %bc
 }
 
-define amdgpu_ps <2 x float> @xor3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps <2 x float> @xor3_multiuse_inner(i32 %a, i32 %b, i32 %c) #0 {
 ; GFX9-LABEL: xor3_multiuse_inner:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -142,7 +142,7 @@
 
 ; A case where uniform values end up in VGPRs -- we could use v_xor3_b32 here,
 ; but we don't.
-define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) {
+define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) #0 {
 ; GFX9-LABEL: xor3_uniform_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
@@ -173,3 +173,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/xor_add.ll b/llvm/test/CodeGen/AMDGPU/xor_add.ll
--- a/llvm/test/CodeGen/AMDGPU/xor_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor_add.ll
@@ -7,7 +7,7 @@
 ; V_XAD_U32
 ; ===================================================================================
 
-define amdgpu_ps float @xor_add(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @xor_add(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: xor_add:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -31,7 +31,7 @@
 }
 
 ; ThreeOp instruction variant not used due to Constant Bus Limitations
-define amdgpu_ps float @xor_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
+define amdgpu_ps float @xor_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) #0 {
 ; VI-LABEL: xor_add_vgpr_a:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
@@ -55,7 +55,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor_add_vgpr_all(i32 %a, i32 %b, i32 %c) {
+define amdgpu_ps float @xor_add_vgpr_all(i32 %a, i32 %b, i32 %c) #0 {
 ; VI-LABEL: xor_add_vgpr_all:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -78,7 +78,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
+define amdgpu_ps float @xor_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) #0 {
 ; VI-LABEL: xor_add_vgpr_ab:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -101,7 +101,7 @@
   ret float %bc
 }
 
-define amdgpu_ps float @xor_add_vgpr_const(i32 %a, i32 %b) {
+define amdgpu_ps float @xor_add_vgpr_const(i32 %a, i32 %b) #0 {
 ; VI-LABEL: xor_add_vgpr_const:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_xor_b32_e32 v0, 3, v0
@@ -123,3 +123,4 @@
   %bc = bitcast i32 %result to float
   ret float %bc
 }
+attributes #0 = { nounwind }