diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -255,6 +255,8 @@ //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlane16_f32, "fffUiUiIbIb", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlanex16_f32, "fffUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts") //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17354,6 +17354,35 @@ return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: + case AMDGPU::BI__builtin_amdgcn_permlane16_f32: + case AMDGPU::BI__builtin_amdgcn_permlanex16_f32: { + Intrinsic::ID Intrin; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlane16_f32: + Intrin = Intrinsic::amdgcn_permlane16; + break; + case AMDGPU::BI__builtin_amdgcn_permlanex16: + case AMDGPU::BI__builtin_amdgcn_permlanex16_f32: + Intrin = Intrinsic::amdgcn_permlanex16; + break; + } + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); + llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); + + llvm::Function *F = CGM.getIntrinsic(Intrin, Src1->getType()); + return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); + } + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane); + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane); // amdgcn workitem case AMDGPU::BI__builtin_amdgcn_workitem_id_x: return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl @@ -7,17 +7,30 @@ typedef unsigned long ulong; // CHECK-LABEL: @test_permlane16( -// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0); } // CHECK-LABEL: @test_permlanex16( -// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0); } +// CHECK-LABEL: @test_permlane16_f32( +// CHECK: call float @llvm.amdgcn.permlane16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false) +void 
test_permlane16_f32(global float* out, float a, float b, uint c, uint d) { + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 0, 0); +} + +// CHECK-LABEL: @test_permlanex16_f32( +// CHECK: call float @llvm.amdgcn.permlanex16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false) +void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d) { + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 0, 0); +} + + // CHECK-LABEL: @test_mov_dpp8( // CHECK: call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1) void test_mov_dpp8(global uint* out, uint a) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -292,14 +292,14 @@ } // CHECK-LABEL: @test_readfirstlane -// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl @@ -13,6 +13,16 @@ *out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16' must be a constant integer}} } +void test_permlane16_f32(global float* out, float a, float b, uint c, uint d, uint e) { + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}} + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}} +} + +void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d, uint e) { + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}} + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}} +} + void test_mov_dpp8(global uint* out, uint a, uint b) { *out = __builtin_amdgcn_mov_dpp8(a, b); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}} } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1667,15 +1667,13 @@ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; def int_amdgcn_readfirstlane : - ClangBuiltin<"__builtin_amdgcn_readfirstlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. 
def int_amdgcn_readlane : - ClangBuiltin<"__builtin_amdgcn_readlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // The value to write and lane select arguments must be uniform across the @@ -1683,10 +1681,10 @@ // undefined. def int_amdgcn_writelane : ClangBuiltin<"__builtin_amdgcn_writelane">, - Intrinsic<[llvm_i32_ty], [ - llvm_i32_ty, // uniform value to write: returned by the selected lane + Intrinsic<[llvm_any_ty], [ + LLVMMatchType<0>, // uniform value to write: returned by the selected lane llvm_i32_ty, // uniform lane select - llvm_i32_ty // returned by all lanes other than the selected one + LLVMMatchType<0> // returned by all lanes other than the selected one ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] >; @@ -1941,16 +1939,16 @@ //===----------------------------------------------------------------------===// // llvm.amdgcn.permlane16 -def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], +def int_amdgcn_permlane16 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlanex16 -def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], +def int_amdgcn_permlanex16 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -344,7 +344,7 @@ V = buildNonAtomicBinOp( B, Op, V, B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + Intrinsic::amdgcn_permlanex16, {B.getInt32Ty()}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()})); if (ST->isWave32()) @@ -359,7 +359,7 @@ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty}); Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); return buildNonAtomicBinOp(B, Op, Lane0, Lane32); @@ -402,7 +402,7 @@ // 48..63). assert(ST->hasPermLaneX16()); Value *const PermX = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + Intrinsic::amdgcn_permlanex16, {B.getInt32Ty()}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp( B, Op, V, B.CreateCall(UpdateDPP, @@ -411,8 +411,8 @@ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63.
- Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {V, B.getInt32(31)}); + Value *const Lane31 = B.CreateIntrinsic( + Intrinsic::amdgcn_readlane, {V->getType()}, {V, B.getInt32(31)}); V = buildNonAtomicBinOp( B, Op, V, B.CreateCall(UpdateDPP, @@ -439,9 +439,9 @@ B.getInt32(0xf), B.getFalse()}); } else { Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty}); Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {Ty}); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. @@ -592,7 +592,7 @@ // will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); assert(TyBitWidth == 32); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {Ty}, {NewV, LastLaneIdx}); } @@ -672,27 +672,8 @@ // We need to broadcast the value who was the lowest active lane (the first // lane) to all other lanes in the wavefront. We use an intrinsic for this, // but have to handle 64-bit broadcasts with two calls to this intrinsic. - Value *BroadcastI = nullptr; - - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); - CallInst *const ReadFirstLaneLo = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - CallInst *const ReadFirstLaneHi = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - Value *const PartialInsert = B.CreateInsertElement( - PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); - BroadcastI = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + Value *BroadcastI = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {Ty}, {PHI}); // Now that we have the result of our single atomic operation, we need to // get our individual lane's slice into the result. 
We use the lane offset diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -233,6 +233,10 @@ bool visitIntrinsicInst(IntrinsicInst &I); bool visitBitreverseIntrinsicInst(IntrinsicInst &I); + bool visitLaneIntrinsicInst(IntrinsicInst &I); + Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID, + Value *Data0, Value *Lane = nullptr, + Value *Data1 = nullptr); bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -943,6 +943,17 @@ if (match(Src, PatternMatch::m_Intrinsic())) { return IC.replaceInstUsesWith(II, Src); } + + // readfirstlane (bitcast x) -> bitcast (readfirstlane x) + Value *BitcastInput = nullptr; + if (match(Src, + PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) { + CallInst *NewCall = + IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, + {BitcastInput->getType()}, BitcastInput); + Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType()); + return IC.replaceInstUsesWith(II, NewCast); + } } else { // readlane (readlane x, y), y -> readlane x, y if (match(Src, PatternMatch::m_Intrinsic( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" @@ -76,6 +77,12 @@ bool canWidenScalarExtLoad(LoadInst &LI) const; bool visitLoadInst(LoadInst &LI); + bool visitIntrinsicInst(IntrinsicInst &I); + bool visitLaneIntrinsicInst(IntrinsicInst &I); + + Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID, + Value *Data0, Value *Data1, Value *Lane0, + Value *Lane1, Value *Mod0, Value *Mod1); }; } // end anonymous namespace @@ -177,6 +184,172 @@ return true; } +Value *AMDGPULateCodeGenPrepare::buildLegalLaneIntrinsic( + IRBuilder<> &B, Intrinsic::ID IID, Value *Data0, Value *Data1, Value *Lane0, + Value *Lane1, Value *Mod0, Value *Mod1) { + Type *Ty = Data0->getType(); + bool IsPermLane = (IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16); + + if (Ty == B.getInt32Ty()) { + if (IsPermLane) { + Value *Args[6] = {Data0, Data1, Lane0, Lane1, Mod0, Mod1}; + return B.CreateIntrinsic(IID, {Ty}, {Args}); + } + + // {write, read, readfirst}lane + Value *Args[3] = {Data0, Lane0, Data1}; + unsigned NumArgs = Data1 != nullptr ? 3 : Lane0 != nullptr ? 
2 : 1; + return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs}); + } + + if (auto *VecTy = dyn_cast(Ty)) { + Type *EltType = VecTy->getElementType(); + bool is16Bit = + (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) || + (EltType->isHalfTy()); + int EC = VecTy->getElementCount().getKnownMinValue(); + + Value *Result = UndefValue::get(Ty); + for (int i = 0; i < EC; i += 1 + is16Bit) { + Value *EltData0; + Value *EltData1 = nullptr; + + if (is16Bit) { + int Idxs[2] = {i, i + 1}; + EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs); + EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty()); + } else { + EltData0 = B.CreateExtractElement(Data0, i); + } + + if (Data1) { + if (is16Bit) { + int Idxs[2] = {i, i + 1}; + EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs); + EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty()); + } else { + EltData1 = B.CreateExtractElement(Data1, i); + } + } + + Value *EltResult = buildLegalLaneIntrinsic(B, IID, EltData0, EltData1, + Lane0, Lane1, Mod0, Mod1); + + if (is16Bit) { + EltResult = + B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2)); + for (int j = 0; j < 2; ++j) { + if (i + j >= EC) + break; + Result = B.CreateInsertElement( + Result, B.CreateExtractElement(EltResult, j), i + j); + } + } else { + Result = B.CreateInsertElement(Result, EltResult, i); + } + } + + return Result; + } + + unsigned BitWidth = DL->getTypeSizeInBits(Ty); + Type *IntTy = Ty; + + if (!Ty->isIntegerTy()) { + IntTy = IntegerType::get(Mod->getContext(), BitWidth); + if (!Ty->isPointerTy()) { + Data0 = B.CreateBitCast(Data0, IntTy); + if (Data1) + Data1 = B.CreateBitCast(Data1, IntTy); + } + } + + if ((BitWidth % 32) != 0) { + Type *ExtendedTy = + IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31); + Data0 = B.CreateZExt(Data0, ExtendedTy); + if (Data1) + Data1 = B.CreateZExt(Data1, ExtendedTy); + } + + if (BitWidth > 32) { + Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32); + Data0 = B.CreateBitCast(Data0, VecTy); + if (Data1) + Data1 = B.CreateBitCast(Data1, VecTy); + } + + Value *Result = + buildLegalLaneIntrinsic(B, IID, Data0, Data1, Lane0, Lane1, Mod0, Mod1); + + if ((BitWidth % 32) != 0) { + if (BitWidth > 32) { + Result = B.CreateBitCast( + Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) / 32)); + } + + Result = + B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth)); + } + + return Result->getType()->isPointerTy() ? Result + : B.CreateBitCast(Result, Ty); +} + +/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics +/// on i32. +/// +/// Done during codegen prepare purely because this turned out to be simpler +/// than doing it in this generality in SelectionDAG. 
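+///
+/// Illustrative sketch only (the value names below are invented and the exact
+/// instruction order may differ slightly): a 64-bit read such as
+///
+///   %v = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %x)
+///
+/// is rewritten in terms of the 32-bit intrinsic roughly as
+///
+///   %vec = bitcast i64 %x to <2 x i32>
+///   %e0  = extractelement <2 x i32> %vec, i64 0
+///   %r0  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %e0)
+///   %v0  = insertelement <2 x i32> undef, i32 %r0, i64 0
+///   %e1  = extractelement <2 x i32> %vec, i64 1
+///   %r1  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %e1)
+///   %v1  = insertelement <2 x i32> %v0, i32 %r1, i64 1
+///   %v   = bitcast <2 x i32> %v1 to i64
+///
+/// Vectors are split per 32-bit element (pairs of 16-bit elements are packed
+/// into one i32), and sub-dword scalars are zero-extended to 32 bits first.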
+bool AMDGPULateCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I) { + Type *Ty = I.getType(); + if (Ty->isIntegerTy(32) && Ty->getIntegerBitWidth() == 32) + return false; // already legal + + Value *Data0 = I.getArgOperand(0); + Value *Data1 = nullptr; + Value *Lane0 = nullptr; + Value *Lane1 = nullptr; + Value *Mod0 = nullptr; + Value *Mod1 = nullptr; + + if (I.getIntrinsicID() == Intrinsic::amdgcn_readlane) { + Lane0 = I.getArgOperand(1); + } else if (I.getIntrinsicID() == Intrinsic::amdgcn_writelane) { + Lane0 = I.getArgOperand(1); + Data1 = I.getArgOperand(2); + } else if (I.getIntrinsicID() == Intrinsic::amdgcn_permlane16 || + I.getIntrinsicID() == Intrinsic::amdgcn_permlanex16) { + Data1 = I.getArgOperand(1); + Lane0 = I.getArgOperand(2); + Lane1 = I.getArgOperand(3); + Mod0 = I.getArgOperand(4); + Mod1 = I.getArgOperand(5); + } + + IRBuilder<> Builder(&I); + Value *Legalized = buildLegalLaneIntrinsic(Builder, I.getIntrinsicID(), Data0, + Data1, Lane0, Lane1, Mod0, Mod1); + + I.replaceAllUsesWith(Legalized); + I.eraseFromParent(); + return true; +} + +bool AMDGPULateCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + return visitLaneIntrinsicInst(I); + default: + return false; + } +} + INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1117,13 +1117,13 @@ bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - if (TM->getOptLevel() > CodeGenOpt::None) - addPass(createAMDGPULateCodeGenPreparePass()); - if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { addPass(createAMDGPUAtomicOptimizerPass()); } + if (TM->getOptLevel() > CodeGenOpt::None) + addPass(createAMDGPULateCodeGenPreparePass()); + if (TM->getOptLevel() > CodeGenOpt::None) addPass(createSinkingPass()); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3145,7 +3145,7 @@ // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. 
def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (int_amdgcn_readfirstlane (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -681,8 +681,8 @@ class PermlanePat<SDPatternOperator permlane, Instruction inst> : GCNPat< - (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, - timm:$fi, timm:$bc), + (i32 (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc)), (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -7,14 +7,14 @@ ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out @@ -42,7 +42,7 @@ ret void } -; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2) +; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2) define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2) store i32 %tmp0, ptr addrspace(1) %out diff --git a/llvm/test/Assembler/autoupgrade-amdgpu-intrinsics.ll b/llvm/test/Assembler/autoupgrade-amdgpu-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Assembler/autoupgrade-amdgpu-intrinsics.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S < %s | FileCheck %s + +declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.readlane(i32, i32) +declare i32 @llvm.amdgcn.readfirstlane(i32) +declare i32 @llvm.amdgcn.writelane(i32, i32, i32) + +define void @test_permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { +; CHECK-LABEL: define void @test_permlane16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) { +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 [[SRC0]], i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 false) +; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1
false) + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { +; CHECK-LABEL: define void @test_permlanex16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) { +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[SRC0]], i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 false) +; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readlane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: define void @test_readlane +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[READLANE:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC]], i32 15) +; CHECK-NEXT: store i32 [[READLANE]], ptr addrspace(1) [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %readlane = call i32 @llvm.amdgcn.readlane(i32 %src, i32 15) + store i32 %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: define void @test_readfirstlane +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[READFIRSTLANE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC]]) +; CHECK-NEXT: store i32 [[READFIRSTLANE]], ptr addrspace(1) [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) + store i32 %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +define void @test_writelane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: define void @test_writelane +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[WRITELANE:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 1234, i32 15, i32 [[SRC]]) +; CHECK-NEXT: store i32 [[WRITELANE]], ptr addrspace(1) [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %writelane = call i32 @llvm.amdgcn.writelane(i32 1234, i32 15, i32 %src) + store i32 %writelane, ptr addrspace(1) %out, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,5 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s ; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-atomic-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -9,7 +8,8 @@ declare void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg) define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_add( +; IR-LABEL: define amdgpu_cs void @atomic_add +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -22,7 +22,7 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 
[[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: ret void @@ -48,7 +48,8 @@ } define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_add_and_format( +; IR-LABEL: define amdgpu_cs void @atomic_add_and_format +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -61,11 +62,11 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void @@ -103,7 +104,8 @@ } define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_sub( +; IR-LABEL: define amdgpu_cs void @atomic_sub +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -116,7 +118,7 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: ret void @@ -142,7 +144,8 @@ } define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_sub_and_format( +; IR-LABEL: define amdgpu_cs void @atomic_sub_and_format +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -155,11 +158,11 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void @@ -197,7 +200,8 @@ } define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_xor( +; IR-LABEL: define amdgpu_cs void @atomic_xor +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -211,7 +215,7 @@ ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -238,7 +242,8 @@ } define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_xor_and_format( +; IR-LABEL: define amdgpu_cs void @atomic_xor_and_format +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -252,11 +257,11 @@ ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s diff --git 
a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll @@ -1,10 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -13,7 +14,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -29,9 +30,10 @@ } define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_max_neg_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_max_neg_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 -1024 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -40,7 +42,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -56,9 +58,10 @@ } define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_soffset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_soffset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 9000 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -67,7 +70,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; 
IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -83,9 +86,10 @@ } define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_huge_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_huge_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 47224239175595 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -94,7 +98,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -111,9 +115,10 @@ } define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_add_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -122,7 +127,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -130,10 +135,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -144,9 +149,10 @@ } define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; 
IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -156,7 +162,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -173,9 +179,10 @@ } define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -185,7 +192,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -193,10 +200,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -208,7 +215,8 @@ } define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -218,11 +226,11 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: 
[[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -233,7 +241,8 @@ } define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_add_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -243,18 +252,18 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -264,9 +273,10 @@ } define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -275,7 +285,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -291,9 +301,10 @@ } define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 
[[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -302,7 +313,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -310,10 +321,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -324,9 +335,10 @@ } define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_and_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -336,7 +348,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -348,9 +360,10 @@ } define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_and_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -360,14 +373,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -378,9 +391,10 @@ } define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -391,7 +405,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -404,9 +418,10 @@ } define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -417,14 +432,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -436,7 +451,8 @@ } define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_and_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] 
= call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -447,7 +463,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -458,7 +474,8 @@ } define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_and_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -469,14 +486,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -486,9 +503,10 @@ } define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -498,7 +516,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -510,9 +528,10 @@ } define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr 
addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -522,14 +541,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -540,9 +559,10 @@ } define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_sub_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -551,7 +571,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -567,9 +587,10 @@ } define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_sub_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -578,7 +599,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -586,10 +607,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ 
[[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -600,9 +621,10 @@ } define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -612,7 +634,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -629,9 +651,10 @@ } define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -641,7 +664,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -649,10 +672,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -664,7 +687,8 @@ } define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) 
%out, i32 %in) { -; IR-LABEL: @atomic_sub_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -674,11 +698,11 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -689,7 +713,8 @@ } define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_sub_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -699,18 +724,18 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -720,9 +745,10 @@ } define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -731,7 +757,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: 
[[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -747,9 +773,10 @@ } define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -758,7 +785,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -766,10 +793,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -780,9 +807,10 @@ } define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_max_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -792,7 +820,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -804,9 +832,10 @@ } define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_max_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; 
IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -816,15 +845,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -835,9 +864,10 @@ } define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -848,7 +878,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -861,9 +891,10 @@ } define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -874,15 +905,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: 
[[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -894,7 +925,8 @@ } define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_max_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -905,7 +937,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -916,7 +948,8 @@ } define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_max_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -927,15 +960,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -945,9 +978,10 @@ } define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_addr64 +; IR-SAME: (ptr addrspace(1) 
[[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -957,7 +991,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -969,9 +1003,10 @@ } define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -981,15 +1016,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1000,9 +1035,10 @@ } define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_umax_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1012,7 +1048,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = 
atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1024,9 +1060,10 @@ } define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_umax_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1036,15 +1073,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1055,9 +1092,10 @@ } define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1068,7 +1106,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1081,9 +1119,10 @@ } define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset +; IR-SAME: (ptr 
addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1094,15 +1133,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1114,7 +1153,8 @@ } define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_umax_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1125,7 +1165,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1136,7 +1176,8 @@ } define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_umax_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1147,15 +1188,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1165,9 +1206,10 @@ } define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1177,7 +1219,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1189,9 +1231,10 @@ } define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1201,15 +1244,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1220,9 +1263,10 @@ } define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_min_i32_offset( +; 
IR-LABEL: define amdgpu_kernel void @atomic_min_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1232,7 +1276,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1244,9 +1288,10 @@ } define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_min_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1256,15 +1301,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1275,9 +1320,10 @@ } define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1288,7 +1334,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], 
label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1301,9 +1347,10 @@ } define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1314,15 +1361,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1334,7 +1381,8 @@ } define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_min_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1345,7 +1393,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1356,7 +1404,8 @@ } define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_min_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1367,15 
+1416,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1385,9 +1434,10 @@ } define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1397,7 +1447,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1409,9 +1459,10 @@ } define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1421,15 +1472,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -6,6 +6,22 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) + + +declare i16 @llvm.amdgcn.permlane16.i16(i16, i16, i32, i32, i1, i1) #0 +declare half @llvm.amdgcn.permlane16.f16(half, half, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1) #0 +declare <3 x i16> @llvm.amdgcn.permlane16.v3i16(<3 x i16>, <3 x i16>, i32, i32, i1, i1) #0 +declare <9 x float> @llvm.amdgcn.permlane16.v9f32(<9 x float>, <9 x float>, i32, i32, i1, i1) #0 + +declare i16 @llvm.amdgcn.permlanex16.i16(i16, i16, i32, i32, i1, i1) #0 +declare half @llvm.amdgcn.permlanex16.f16(half, half, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.permlanex16.f32(float, float, i32, i32, i1, i1) #0 +declare <3 x i16> @llvm.amdgcn.permlanex16.v3i16(<3 x i16>, <3 x i16>, i32, i32, i1, i1) #0 +declare <9 x float> @llvm.amdgcn.permlanex16.v9f32(<9 x float>, <9 x float>, i32, i32, i1, i1) #0 + + + declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() @@ -917,6 +933,255 @@ ret void } +define void @test_permlane_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) #1 { +; GFX10-LABEL: test_permlane_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_permlane_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_f16(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) #1 { +; GFX10-LABEL: test_permlane_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-NEXT: global_store_short v[0:1], 
v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_permlane_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store half %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) #1 { +; GFX10-LABEL: test_permlane_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_permlane_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_v3i16(ptr addrspace(1) %out, <3 x i16> %src0, i32 %src1, i32 %src2) #1 { +; GFX10-SDAG-LABEL: test_permlane_v3i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlane_v3i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v3, s4, 16, v3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX10-GISEL-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:2 +; GFX10-GISEL-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlane_v3i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_permlane_v3i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v[0:1], v2, off offset:2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i16> @llvm.amdgcn.permlane16.v3i16(<3 x i16> %src0, <3 x i16> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <3 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_v9f32(ptr addrspace(1) %out, <9 x float> %src0, i32 %src1, i32 %src2) #1 { +; GFX10-SDAG-LABEL: test_permlane_v9f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-SDAG-NEXT: v_permlane16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlane_v9f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, 
v6, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlane_v9f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_permlane_v9f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <9 x float> @llvm.amdgcn.permlane16.v9f32(<9 x float> %src0, <9 x float> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <9 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX10-LABEL: v_permlanex16_b32_tid_tid: ; GFX10: ; %bb.0: @@ -1122,3 +1387,252 @@ store i32 %v, ptr addrspace(1) %out ret void } + +define void @test_permlanex16_i16(ptr addrspace(1) %out, i16 
%src0, i32 %src1, i32 %src2) #1 {
+; GFX10-LABEL: test_permlanex16_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0]
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_permlanex16_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0]
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i16 %v, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_permlanex16_f16(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) #1 {
+; GFX10-LABEL: test_permlanex16_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0]
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_permlanex16_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0]
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store half %v, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_permlanex16_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) #1 {
+; GFX10-LABEL: test_permlanex16_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_permlanex16_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 
false) + store float %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlanex16_v3i16(ptr addrspace(1) %out, <3 x i16> %src0, i32 %src1, i32 %src2) #1 { +; GFX10-SDAG-LABEL: test_permlanex16_v3i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlanex16_v3i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v3, s4, 16, v3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX10-GISEL-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:2 +; GFX10-GISEL-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlanex16_v3i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_permlanex16_v3i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v[0:1], v2, off offset:2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i16> @llvm.amdgcn.permlanex16.v3i16(<3 x i16> %src0, <3 x i16> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <3 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlanex16_v9f32(ptr addrspace(1) %out, <9 x float> %src0, i32 %src1, i32 %src2) #1 { +; 
GFX10-SDAG-LABEL: test_permlanex16_v9f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlanex16_v9f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlanex16_v9f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-GISEL-LABEL: test_permlanex16_v9f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <9 x float> @llvm.amdgcn.permlanex16.v9f32(<9 x float> %src0, <9 x float> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <9 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,11 +1,20 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #0 +declare float @llvm.amdgcn.readfirstlane.f32(float) #0 +declare <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half>) #0 +declare <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16>) #0 +declare i16 @llvm.amdgcn.readfirstlane.i16(i16) #0 +declare half @llvm.amdgcn.readfirstlane.f16(half) #0 +declare <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16>) #0 +declare <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float>) #0 + ; CHECK-LABEL: {{^}}test_readfirstlane: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } @@ -15,7 +24,7 @@ ; CHECK-NOT: [[SGPR_VAL]] ; CHECK: ; use [[SGPR_VAL]] define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } @@ -25,7 +34,7 @@ ; CHECK-NOT: [[VVAL]] ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 
ret void } @@ -36,7 +45,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %m0) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } @@ -51,7 +60,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 { %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } @@ -62,10 +71,84 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { %alloca = alloca i32, addrspace(5) %int = ptrtoint ptr addrspace(5) %alloca to i32 - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %int) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } +; CHECK-LABEL: {{^}}test_readfirstlane_v2f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) #1 { + %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src) + store <2 x half> %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v2i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2i16(ptr addrspace(1) %out, <2 x i16> %src) #1 { + %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src) + store <2 x i16> %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + + +; CHECK-LABEL: {{^}}test_readfirstlane_i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) { + %readfirstlane = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src) + store i16 %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f16(ptr addrspace(1) %out, half %src) { + %readfirstlane = call half @llvm.amdgcn.readfirstlane.f16(half %src) + store half %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_f32: +; CHECK-NOT: v_cvt_f32_i32_e32 +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f32(ptr addrspace(1) %out, float %src) #1 { + %readfirstlane = call float @llvm.amdgcn.readfirstlane.f32(float %src) + store float %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v3i16: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v3i16(ptr addrspace(1) %out, <3 x i16> %src) { + %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16> %src) + store <3 x i16> %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v9f32: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, 
v2 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v9f32(ptr addrspace(1) %out, <9 x float> %src) { + %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float> %src) + store <9 x float> %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + attributes #0 = { nounwind readnone convergent } -attributes #1 = { nounwind } +attributes #1 = { nounwind } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -1,6 +1,11 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s declare i32 @llvm.amdgcn.readlane(i32, i32) #0 +declare i16 @llvm.amdgcn.readlane.i16(i16, i32) #0 +declare half @llvm.amdgcn.readlane.f16(half, i32) #0 +declare float @llvm.amdgcn.readlane.f32(float, i32) #0 +declare <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16>, i32) #0 +declare <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float>, i32) #0 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg: ; CHECK-NOT: v_readlane_b32 @@ -77,8 +82,63 @@ ret void } +; CHECK-LABEL: {{^}}test_readlane_i16: +; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src) { + %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 15) + store i16 %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_f16: +; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_f16(ptr addrspace(1) %out, half %src) { + %readlane = call half @llvm.amdgcn.readlane.f16(half %src, i32 15) + store half %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_f32: +; CHECK-NOT: v_cvt_f32_i32_e32 +; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_f32(ptr addrspace(1) %out, float %src) { + %readlane = call float @llvm.amdgcn.readlane.f32(float %src, i32 15) + store float %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_v3i16: +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_v3i16(ptr addrspace(1) %out, <3 x i16> %src) { + %readlane = call <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16> %src, i32 15) + store <3 x i16> %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_v9f32: +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15 +; CHECK-NOT: v_readlane_b32 +define void 
@test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) { + %readlane = call <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float> %src, i32 15) + store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readnone } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -4,6 +4,11 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0 +declare i16 @llvm.amdgcn.writelane.i16(i16, i32, i16) #0 +declare half @llvm.amdgcn.writelane.f16(half, i32, half) #0 +declare float @llvm.amdgcn.writelane.f32(float, i32, float) #0 +declare <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16>, i32, <3 x i16>) #0 +declare <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float>, i32, <9 x float>) #0 ; CHECK-LABEL: {{^}}test_writelane_sreg: ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0 @@ -80,8 +85,66 @@ ret void } +; CHECK-LABEL: {{^}}test_writelane_i16: +; CHECK: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src) { + %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 1234, i32 15, i16 %src) + store i16 %writelane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_writelane_f16: +; CHECK: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_f16(ptr addrspace(1) %out, half %src) { + %writelane = call half @llvm.amdgcn.writelane.f16(half 1.0, i32 15, half %src) + store half %writelane, ptr addrspace(1) %out, align 2 + ret void +} + + +; CHECK-LABEL: {{^}}test_writelane_f32: +; CHECK-NOT: v_cvt_f32_i32_e32 +; CHECK: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_f32(ptr addrspace(1) %out, float %src) #1 { + %writelane = call float @llvm.amdgcn.writelane.f32(float 2.0, i32 15, float %src) + store float %writelane, ptr addrspace(1) %out, align 4 + ret void +} + + +; CHECK-LABEL: {{^}}test_writelane_v3i16: +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_v3i16(ptr addrspace(1) %out, <3 x i16> %src) { + %writelane = call <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src) + store <3 x i16> %writelane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_writelane_v9f32: +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_v9f32(ptr addrspace(1) %out, <9 x float> %src) { + %writelane = call <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src) + 
store <9 x float> %writelane, ptr addrspace(1) %out, align 2 + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readnone } \ No newline at end of file diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine -S < %s | FileCheck %s ; -------------------------------------------------------------------- @@ -9,7 +9,8 @@ declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone define float @test_constant_fold_rcp_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_undef +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone @@ -17,7 +18,8 @@ } define float @test_constant_fold_rcp_f32_1() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_1( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 1.000000e+00 ; %val = call float @llvm.amdgcn.rcp.f32(float 1.0) nounwind readnone @@ -25,7 +27,8 @@ } define double @test_constant_fold_rcp_f64_1() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f64_1( +; CHECK-LABEL: define double @test_constant_fold_rcp_f64_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 1.000000e+00 ; %val = call double @llvm.amdgcn.rcp.f64(double 1.0) nounwind readnone @@ -33,7 +36,8 @@ } define float @test_constant_fold_rcp_f32_half() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_half( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_half +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 2.000000e+00 ; %val = call float @llvm.amdgcn.rcp.f32(float 0.5) nounwind readnone @@ -41,7 +45,8 @@ } define double @test_constant_fold_rcp_f64_half() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f64_half( +; CHECK-LABEL: define double @test_constant_fold_rcp_f64_half +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 2.000000e+00 ; %val = call double @llvm.amdgcn.rcp.f64(double 0.5) nounwind readnone @@ -49,7 +54,8 @@ } define float @test_constant_fold_rcp_f32_43() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_43( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_43 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x3F97D05F40000000 ; %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone @@ -57,7 +63,8 @@ } define double @test_constant_fold_rcp_f64_43() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f64_43( +; CHECK-LABEL: define double @test_constant_fold_rcp_f64_43 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x3F97D05F417D05F4 ; %val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone @@ -65,8 +72,9 @@ } define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp { -; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
4.300000e+01) #[[ATTR14:[0-9]+]] +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_43_strictfp +; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR13:[0-9]+]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone @@ -82,7 +90,8 @@ declare double @llvm.amdgcn.sqrt.f64(double) nounwind readnone define half @test_constant_fold_sqrt_f16_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f16_undef( +; CHECK-LABEL: define half @test_constant_fold_sqrt_f16_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret half 0xH7E00 ; %val = call half @llvm.amdgcn.sqrt.f16(half undef) nounwind readnone @@ -90,7 +99,8 @@ } define float @test_constant_fold_sqrt_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_sqrt_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.sqrt.f32(float undef) nounwind readnone @@ -98,7 +108,8 @@ } define double @test_constant_fold_sqrt_f64_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f64_undef( +; CHECK-LABEL: define double @test_constant_fold_sqrt_f64_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x7FF8000000000000 ; %val = call double @llvm.amdgcn.sqrt.f64(double undef) nounwind readnone @@ -106,8 +117,9 @@ } define half @test_constant_fold_sqrt_f16_0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f16_0( -; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR15:[0-9]+]] +; CHECK-LABEL: define half @test_constant_fold_sqrt_f16_0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR14:[0-9]+]] ; CHECK-NEXT: ret half [[VAL]] ; %val = call half @llvm.amdgcn.sqrt.f16(half 0.0) nounwind readnone @@ -115,8 +127,9 @@ } define float @test_constant_fold_sqrt_f32_0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f32_0( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define float @test_constant_fold_sqrt_f32_0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.sqrt.f32(float 0.0) nounwind readnone @@ -124,8 +137,9 @@ } define double @test_constant_fold_sqrt_f64_0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f64_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define double @test_constant_fold_sqrt_f64_0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.sqrt.f64(double 0.0) nounwind readnone @@ -133,8 +147,9 @@ } define half @test_constant_fold_sqrt_f16_neg0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f16_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR15]] +; CHECK-LABEL: define half @test_constant_fold_sqrt_f16_neg0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR14]] ; CHECK-NEXT: ret half [[VAL]] ; %val = call half @llvm.amdgcn.sqrt.f16(half -0.0) nounwind readnone @@ -142,8 +157,9 @@ } define float @test_constant_fold_sqrt_f32_neg0() nounwind 
{ -; CHECK-LABEL: @test_constant_fold_sqrt_f32_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define float @test_constant_fold_sqrt_f32_neg0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.sqrt.f32(float -0.0) nounwind readnone @@ -151,8 +167,9 @@ } define double @test_constant_fold_sqrt_f64_neg0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f64_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define double @test_constant_fold_sqrt_f64_neg0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.sqrt.f64(double -0.0) nounwind readnone @@ -160,7 +177,8 @@ } define double @test_constant_fold_sqrt_snan_f64() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_snan_f64( +; CHECK-LABEL: define double @test_constant_fold_sqrt_snan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0x7FF0000000000001) ; CHECK-NEXT: ret double [[VAL]] ; @@ -169,7 +187,8 @@ } define double @test_constant_fold_sqrt_qnan_f64() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_qnan_f64( +; CHECK-LABEL: define double @test_constant_fold_sqrt_qnan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0x7FF8000000000000) ; CHECK-NEXT: ret double [[VAL]] ; @@ -178,7 +197,8 @@ } define double @test_constant_fold_sqrt_neg1() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_neg1( +; CHECK-LABEL: define double @test_constant_fold_sqrt_neg1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -1.000000e+00) ; CHECK-NEXT: ret double [[VAL]] ; @@ -193,7 +213,8 @@ declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone define float @test_constant_fold_rsq_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_rsq_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_rsq_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone @@ -209,7 +230,8 @@ define float @test_constant_fold_frexp_mant_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float undef ; %val = call float @llvm.amdgcn.frexp.mant.f32(float undef) @@ -217,7 +239,8 @@ } define double @test_constant_fold_frexp_mant_f64_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_undef( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double undef ; %val = call double @llvm.amdgcn.frexp.mant.f64(double undef) @@ -225,7 +248,8 @@ } define float @test_constant_fold_frexp_mant_f32_0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_0( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0.0) @@ -233,7 +257,8 @@ } define double @test_constant_fold_frexp_mant_f64_0() nounwind { -; CHECK-LABEL: 
@test_constant_fold_frexp_mant_f64_0( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0.000000e+00 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0.0) @@ -241,7 +266,8 @@ } define float @test_constant_fold_frexp_mant_f32_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_n0( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float -0.000000e+00 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float -0.0) @@ -249,7 +275,8 @@ } define double @test_constant_fold_frexp_mant_f64_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_n0( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double -0.000000e+00 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double -0.0) @@ -257,7 +284,8 @@ } define float @test_constant_fold_frexp_mant_f32_1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_1( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 1.0) @@ -265,7 +293,8 @@ } define double @test_constant_fold_frexp_mant_f64_1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_1( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 5.000000e-01 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 1.0) @@ -273,7 +302,8 @@ } define float @test_constant_fold_frexp_mant_f32_n1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_n1( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_n1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float -5.000000e-01 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float -1.0) @@ -281,7 +311,8 @@ } define double @test_constant_fold_frexp_mant_f64_n1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_n1( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_n1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double -5.000000e-01 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double -1.0) @@ -289,7 +320,8 @@ } define float @test_constant_fold_frexp_mant_f32_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_nan( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x7FF8000000000000) @@ -297,7 +329,8 @@ } define double @test_constant_fold_frexp_mant_f64_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_nan( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x7FF8000000000000 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FF8000000000000) @@ -305,7 +338,8 @@ } define float @test_constant_fold_frexp_mant_f32_inf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_inf( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF0000000000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x7FF0000000000000) @@ -313,7 +347,8 @@ } define double @test_constant_fold_frexp_mant_f64_inf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_inf( +; CHECK-LABEL: define double 
@test_constant_fold_frexp_mant_f64_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x7FF0000000000000 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FF0000000000000) @@ -321,7 +356,8 @@ } define float @test_constant_fold_frexp_mant_f32_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_ninf( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0xFFF0000000000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0xFFF0000000000000) @@ -329,7 +365,8 @@ } define double @test_constant_fold_frexp_mant_f64_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_ninf( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0xFFF0000000000000 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0xFFF0000000000000) @@ -337,7 +374,8 @@ } define float @test_constant_fold_frexp_mant_f32_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_max_num( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x3FEFFFFFE0000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x47EFFFFFE0000000) @@ -345,7 +383,8 @@ } define double @test_constant_fold_frexp_mant_f64_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_max_num( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x3FEFFFFFFFFFFFFF ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FEFFFFFFFFFFFFF) @@ -353,7 +392,8 @@ } define float @test_constant_fold_frexp_mant_f32_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_min_num( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x36A0000000000000) @@ -361,7 +401,8 @@ } define double @test_constant_fold_frexp_mant_f64_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_min_num( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 5.000000e-01 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 4.940656e-324) @@ -377,7 +418,8 @@ declare i32 @llvm.amdgcn.frexp.exp.f64(double) nounwind readnone define i32 @test_constant_fold_frexp_exp_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_undef( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 undef ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float undef) @@ -385,7 +427,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_undef( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 undef ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double undef) @@ -393,7 +436,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0.0) @@ -401,7 +445,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_0() nounwind { -; CHECK-LABEL: 
@test_constant_fold_frexp_exp_f64_0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0.0) @@ -409,7 +454,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_n0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float -0.0) @@ -417,7 +463,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_n0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double -0.0) @@ -425,7 +472,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 1024.0) @@ -433,7 +481,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 1024.0) @@ -441,7 +490,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_n1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_n1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_n1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float -1024.0) @@ -449,7 +499,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_n1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_n1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_n1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double -1024.0) @@ -457,7 +508,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_1_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_1_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_1_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -9 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0.0009765625) @@ -465,7 +517,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_1_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_1_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_1_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -9 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0.0009765625) @@ -473,7 +526,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_nan( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x7FF8000000000000) @@ -481,7 +535,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_nan( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FF8000000000000) @@ -489,7 +544,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_inf() nounwind { -; 
CHECK-LABEL: @test_constant_fold_frexp_exp_f32_inf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x7FF0000000000000) @@ -497,7 +553,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_inf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_inf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FF0000000000000) @@ -505,7 +562,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_ninf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0xFFF0000000000000) @@ -513,7 +571,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_ninf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0xFFF0000000000000) @@ -521,7 +580,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_max_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 128 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x47EFFFFFE0000000) @@ -529,7 +589,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_max_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 1024 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FEFFFFFFFFFFFFF) @@ -537,7 +598,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_min_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -148 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x36A0000000000000) @@ -545,7 +607,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_min_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -1073 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 4.940656e-324) @@ -560,7 +623,8 @@ declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone define i1 @test_class_undef_mask_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_undef_mask_f32( +; CHECK-LABEL: define i1 @test_class_undef_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 undef) @@ -568,8 +632,9 @@ } define i1 @test_class_over_max_mask_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_over_max_mask_f32( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 1) +; CHECK-LABEL: define i1 @test_class_over_max_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 1) ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1025) @@ -577,7 +642,8 @@ } define i1 @test_class_no_mask_f32(float %x) nounwind { 
-; CHECK-LABEL: @test_class_no_mask_f32( +; CHECK-LABEL: define i1 @test_class_no_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 0) @@ -585,7 +651,8 @@ } define i1 @test_class_full_mask_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_full_mask_f32( +; CHECK-LABEL: define i1 @test_class_full_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1023) @@ -593,7 +660,8 @@ } define i1 @test_class_undef_no_mask_f32() nounwind { -; CHECK-LABEL: @test_class_undef_no_mask_f32( +; CHECK-LABEL: define i1 @test_class_undef_no_mask_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 0) @@ -601,7 +669,8 @@ } define i1 @test_class_undef_full_mask_f32() nounwind { -; CHECK-LABEL: @test_class_undef_full_mask_f32( +; CHECK-LABEL: define i1 @test_class_undef_full_mask_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 1023) @@ -609,7 +678,8 @@ } define i1 @test_class_undef_val_f32() nounwind { -; CHECK-LABEL: @test_class_undef_val_f32( +; CHECK-LABEL: define i1 @test_class_undef_val_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 undef ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 4) @@ -617,7 +687,8 @@ } define i1 @test_class_undef_undef_f32() nounwind { -; CHECK-LABEL: @test_class_undef_undef_f32( +; CHECK-LABEL: define i1 @test_class_undef_undef_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 undef ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) @@ -625,8 +696,9 @@ } define i1 @test_class_var_mask_f32(float %x, i32 %mask) nounwind { -; CHECK-LABEL: @test_class_var_mask_f32( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 [[MASK:%.*]]) +; CHECK-LABEL: define i1 @test_class_var_mask_f32 +; CHECK-SAME: (float [[X:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 [[MASK]]) ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 %mask) @@ -634,8 +706,9 @@ } define i1 @test_class_isnan_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_isnan_f32( -; CHECK-NEXT: [[VAL:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +; CHECK-LABEL: define i1 @test_class_isnan_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = fcmp uno float [[X]], 0.000000e+00 ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) @@ -643,8 +716,9 @@ } define i1 @test_class_isnan_f32_strict(float %x) nounwind { -; CHECK-LABEL: @test_class_isnan_f32_strict( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR16:[0-9]+]] +; CHECK-LABEL: define i1 @test_class_isnan_f32_strict +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 3) #[[ATTR15:[0-9]+]] ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) strictfp @@ -652,8 +726,9 @@ } define i1 @test_class_is_p0_n0_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_is_p0_n0_f32( -; CHECK-NEXT: [[VAL:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 +; CHECK-LABEL: define i1 @test_class_is_p0_n0_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = fcmp oeq float [[X]], 0.000000e+00 ; CHECK-NEXT: ret i1 [[VAL]] ; 
%val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) @@ -661,8 +736,9 @@ } define i1 @test_class_is_p0_n0_f32_strict(float %x) nounwind { -; CHECK-LABEL: @test_class_is_p0_n0_f32_strict( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR16]] +; CHECK-LABEL: define i1 @test_class_is_p0_n0_f32_strict +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 96) #[[ATTR15]] ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) strictfp @@ -670,7 +746,8 @@ } define i1 @test_constant_class_snan_test_snan_f64() nounwind { -; CHECK-LABEL: @test_constant_class_snan_test_snan_f64( +; CHECK-LABEL: define i1 @test_constant_class_snan_test_snan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 1) @@ -678,7 +755,8 @@ } define i1 @test_constant_class_qnan_test_qnan_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_qnan_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_qnan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 2) @@ -686,7 +764,8 @@ } define i1 @test_constant_class_qnan_test_snan_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_snan_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_snan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 1) @@ -694,7 +773,8 @@ } define i1 @test_constant_class_ninf_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_ninf_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_ninf_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0xFFF0000000000000, i32 4) @@ -702,7 +782,8 @@ } define i1 @test_constant_class_pinf_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pinf_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_pinf_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000000, i32 4) @@ -710,7 +791,8 @@ } define i1 @test_constant_class_qnan_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 4) @@ -718,7 +800,8 @@ } define i1 @test_constant_class_snan_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_snan_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_snan_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 4) @@ -726,7 +809,8 @@ } define i1 @test_constant_class_nnormal_test_nnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nnormal_test_nnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nnormal_test_nnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double -1.0, i32 8) @@ -734,7 +818,8 @@ } define i1 @test_constant_class_pnormal_test_nnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pnormal_test_nnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_pnormal_test_nnormal_f64 +; CHECK-SAME: () #[[ATTR1]] 
{ ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 8) @@ -742,7 +827,8 @@ } define i1 @test_constant_class_nsubnormal_test_nsubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nsubnormal_test_nsubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nsubnormal_test_nsubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x800fffffffffffff, i32 16) @@ -750,7 +836,8 @@ } define i1 @test_constant_class_psubnormal_test_nsubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_psubnormal_test_nsubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_psubnormal_test_nsubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x000fffffffffffff, i32 16) @@ -758,7 +845,8 @@ } define i1 @test_constant_class_nzero_test_nzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nzero_test_nzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_nzero_test_nzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double -0.0, i32 32) @@ -766,7 +854,8 @@ } define i1 @test_constant_class_pzero_test_nzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pzero_test_nzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_pzero_test_nzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0.0, i32 32) @@ -774,7 +863,8 @@ } define i1 @test_constant_class_pzero_test_pzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pzero_test_pzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_pzero_test_pzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0.0, i32 64) @@ -782,7 +872,8 @@ } define i1 @test_constant_class_nzero_test_pzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nzero_test_pzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_nzero_test_pzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double -0.0, i32 64) @@ -790,7 +881,8 @@ } define i1 @test_constant_class_psubnormal_test_psubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_psubnormal_test_psubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_psubnormal_test_psubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x000fffffffffffff, i32 128) @@ -798,7 +890,8 @@ } define i1 @test_constant_class_nsubnormal_test_psubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nsubnormal_test_psubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nsubnormal_test_psubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x800fffffffffffff, i32 128) @@ -806,7 +899,8 @@ } define i1 @test_constant_class_pnormal_test_pnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pnormal_test_pnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_pnormal_test_pnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 256) @@ -814,7 +908,8 @@ } define i1 @test_constant_class_nnormal_test_pnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nnormal_test_pnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nnormal_test_pnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; 
%val = call i1 @llvm.amdgcn.class.f64(double -1.0, i32 256) @@ -822,7 +917,8 @@ } define i1 @test_constant_class_pinf_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pinf_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_pinf_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000000, i32 512) @@ -830,7 +926,8 @@ } define i1 @test_constant_class_ninf_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_ninf_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_ninf_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0xFFF0000000000000, i32 512) @@ -838,7 +935,8 @@ } define i1 @test_constant_class_qnan_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 512) @@ -846,7 +944,8 @@ } define i1 @test_constant_class_snan_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_snan_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_snan_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 512) @@ -854,7 +953,8 @@ } define i1 @test_class_is_snan_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_snan_nnan_src( +; CHECK-LABEL: define i1 @test_class_is_snan_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: ret i1 false ; %nnan = fadd nnan float %x, 1.0 @@ -863,7 +963,8 @@ } define i1 @test_class_is_qnan_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_qnan_nnan_src( +; CHECK-LABEL: define i1 @test_class_is_qnan_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i1 false ; %nnan = fadd nnan float %x, 1.0 @@ -872,7 +973,8 @@ } define i1 @test_class_is_nan_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_nan_nnan_src( +; CHECK-LABEL: define i1 @test_class_is_nan_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i1 false ; %nnan = fadd nnan float %x, 1.0 @@ -881,8 +983,9 @@ } define i1 @test_class_is_nan_other_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_nan_other_nnan_src( -; CHECK-NEXT: [[NNAN:%.*]] = fadd nnan float [[X:%.*]], 1.000000e+00 +; CHECK-LABEL: define i1 @test_class_is_nan_other_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[NNAN:%.*]] = fadd nnan float [[X]], 1.000000e+00 ; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[NNAN]], i32 264) ; CHECK-NEXT: ret i1 [[CLASS]] ; @@ -898,8 +1001,9 @@ declare float @llvm.fabs.f32(float) nounwind readnone define float @cos_fneg_f32(float %x) { -; CHECK-LABEL: @cos_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fneg = fsub float -0.0, %x @@ -908,8 +1012,9 @@ } define float @cos_unary_fneg_f32(float %x) { -; CHECK-LABEL: @cos_unary_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_unary_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float 
@llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fneg = fneg float %x @@ -918,8 +1023,9 @@ } define float @cos_fabs_f32(float %x) { -; CHECK-LABEL: @cos_fabs_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fabs_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fabs = call float @llvm.fabs.f32(float %x) @@ -928,8 +1034,9 @@ } define float @cos_fabs_fneg_f32(float %x) { -; CHECK-LABEL: @cos_fabs_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fabs_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fabs = call float @llvm.fabs.f32(float %x) @@ -939,8 +1046,9 @@ } define float @cos_fabs_unary_fneg_f32(float %x) { -; CHECK-LABEL: @cos_fabs_unary_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fabs_unary_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fabs = call float @llvm.fabs.f32(float %x) @@ -956,8 +1064,9 @@ declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) { -; CHECK-LABEL: @vars_lhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define <2 x half> @vars_lhs_cvt_pkrtz +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X]], float [[Y]]) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -965,8 +1074,9 @@ } define <2 x half> @constant_lhs_cvt_pkrtz(float %y) { -; CHECK-LABEL: @constant_lhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[Y:%.*]]) +; CHECK-LABEL: define <2 x half> @constant_lhs_cvt_pkrtz +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[Y]]) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y) @@ -974,8 +1084,9 @@ } define <2 x half> @constant_rhs_cvt_pkrtz(float %x) { -; CHECK-LABEL: @constant_rhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X:%.*]], float 0.000000e+00) +; CHECK-LABEL: define <2 x half> @constant_rhs_cvt_pkrtz +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X]], float 0.000000e+00) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0) @@ -983,8 +1094,9 @@ } define <2 x half> @undef_lhs_cvt_pkrtz(float %y) { -; CHECK-LABEL: @undef_lhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float [[Y:%.*]]) +; CHECK-LABEL: define <2 x half> @undef_lhs_cvt_pkrtz +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float [[Y]]) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float undef, float %y) @@ -992,8 +1104,9 @@ } define <2 x half> @undef_rhs_cvt_pkrtz(float %x) { -; CHECK-LABEL: @undef_rhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X:%.*]], float undef) +; CHECK-LABEL: define <2 x half> @undef_rhs_cvt_pkrtz +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X]], float undef) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef) @@ -1001,7 +1114,8 @@ } define <2 x half> @undef_cvt_pkrtz() { -; CHECK-LABEL: @undef_cvt_pkrtz( +; CHECK-LABEL: define <2 x half> @undef_cvt_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> undef ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) @@ -1009,7 +1123,8 @@ } define <2 x half> @constant_splat0_cvt_pkrtz() { -; CHECK-LABEL: @constant_splat0_cvt_pkrtz( +; CHECK-LABEL: define <2 x half> @constant_splat0_cvt_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> zeroinitializer ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0) @@ -1017,7 +1132,8 @@ } define <2 x half> @constant_cvt_pkrtz() { -; CHECK-LABEL: @constant_cvt_pkrtz( +; CHECK-LABEL: define <2 x half> @constant_cvt_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> <half 0xH4000, half 0xH4400> ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0) @@ -1026,7 +1142,8 @@ ; Test constant values where rtz changes result define <2 x half> @constant_rtz_pkrtz() { -; CHECK-LABEL: @constant_rtz_pkrtz( +; CHECK-LABEL: define <2 x half> @constant_rtz_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> <half 0xH7BFF, half 0xH7BFF> ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0) @@ -1040,8 +1157,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) { -; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float [[Y:%.*]]) +; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pknorm_i16 +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y) @@ -1049,8 +1167,9 @@ } define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) { -; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float [[X:%.*]], float undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pknorm_i16 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float [[X]], float undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef) @@ -1058,7 +1177,8 @@ } define <2 x i16> @undef_cvt_pknorm_i16() { -; CHECK-LABEL: @undef_cvt_pknorm_i16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pknorm_i16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef) @@ -1072,8 +1192,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) { -; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float [[Y:%.*]]) 
+; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pknorm_u16 +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y) @@ -1081,8 +1202,9 @@ } define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) { -; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float [[X:%.*]], float undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pknorm_u16 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float [[X]], float undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef) @@ -1090,7 +1212,8 @@ } define <2 x i16> @undef_cvt_pknorm_u16() { -; CHECK-LABEL: @undef_cvt_pknorm_u16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pknorm_u16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef) @@ -1104,8 +1227,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) { -; CHECK-LABEL: @undef_lhs_cvt_pk_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 [[Y:%.*]]) +; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pk_i16 +; CHECK-SAME: (i32 [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y) @@ -1113,8 +1237,9 @@ } define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) { -; CHECK-LABEL: @undef_rhs_cvt_pk_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 [[X:%.*]], i32 undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pk_i16 +; CHECK-SAME: (i32 [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 [[X]], i32 undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef) @@ -1122,7 +1247,8 @@ } define <2 x i16> @undef_cvt_pk_i16() { -; CHECK-LABEL: @undef_cvt_pk_i16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pk_i16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef) @@ -1136,8 +1262,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) { -; CHECK-LABEL: @undef_lhs_cvt_pk_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 [[Y:%.*]]) +; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pk_u16 +; CHECK-SAME: (i32 [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y) @@ -1145,8 +1272,9 @@ } define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) { -; CHECK-LABEL: @undef_rhs_cvt_pk_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 [[X:%.*]], i32 undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pk_u16 +; CHECK-SAME: (i32 [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 [[X]], i32 undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 
%x, i32 undef) @@ -1154,7 +1282,8 @@ } define <2 x i16> @undef_cvt_pk_u16() { -; CHECK-LABEL: @undef_cvt_pk_u16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pk_u16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef) @@ -1169,8 +1298,9 @@ declare i64 @llvm.amdgcn.ubfe.i64(i64, i32, i32) nounwind readnone define i32 @ubfe_var_i32(i32 %src, i32 %offset, i32 %width) { -; CHECK-LABEL: @ubfe_var_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_var_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width) @@ -1178,8 +1308,9 @@ } define i32 @ubfe_clear_high_bits_constant_offset_i32(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_clear_high_bits_constant_offset_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 5, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_clear_high_bits_constant_offset_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 5, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 133, i32 %width) @@ -1187,8 +1318,9 @@ } define i32 @ubfe_clear_high_bits_constant_width_i32(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_clear_high_bits_constant_width_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 5) +; CHECK-LABEL: define i32 @ubfe_clear_high_bits_constant_width_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 5) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 133) @@ -1196,7 +1328,8 @@ } define i32 @ubfe_width_0(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_0( +; CHECK-LABEL: define i32 @ubfe_width_0 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 0) @@ -1204,8 +1337,9 @@ } define i32 @ubfe_width_31(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_31( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 31) +; CHECK-LABEL: define i32 @ubfe_width_31 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 31) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31) @@ -1213,7 +1347,8 @@ } define i32 @ubfe_width_32(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_32( +; CHECK-LABEL: define i32 @ubfe_width_32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 32) @@ -1221,8 +1356,9 @@ } define i32 @ubfe_width_33(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_33( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 1) +; CHECK-LABEL: define i32 @ubfe_width_33 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 
[[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 1) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 33) @@ -1230,8 +1366,9 @@ } define i32 @ubfe_offset_33(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_33( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 1, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_33 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 1, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 33, i32 %width) @@ -1239,8 +1376,9 @@ } define i32 @ubfe_offset_0(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_0( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 0, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_0 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 0, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width) @@ -1248,8 +1386,9 @@ } define i32 @ubfe_offset_32(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 0, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 0, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width) @@ -1257,8 +1396,9 @@ } define i32 @ubfe_offset_31(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_31( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 31, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_31 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 31, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 31, i32 %width) @@ -1266,7 +1406,8 @@ } define i32 @ubfe_offset_0_width_0(i32 %src) { -; CHECK-LABEL: @ubfe_offset_0_width_0( +; CHECK-LABEL: define i32 @ubfe_offset_0_width_0 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 0) @@ -1274,17 +1415,19 @@ } define i32 @ubfe_offset_0_width_3(i32 %src) { -; CHECK-LABEL: @ubfe_offset_0_width_3( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[SRC:%.*]], 7 -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-LABEL: define i32 @ubfe_offset_0_width_3 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = and i32 [[SRC]], 7 +; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3) ret i32 %bfe } define i32 @ubfe_offset_3_width_1(i32 %src) { -; CHECK-LABEL: @ubfe_offset_3_width_1( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC:%.*]], 3 +; CHECK-LABEL: define i32 @ubfe_offset_3_width_1 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC]], 3 ; CHECK-NEXT: [[BFE:%.*]] = and i32 [[TMP1]], 1 ; CHECK-NEXT: ret i32 [[BFE]] ; @@ -1293,8 +1436,9 @@ } define i32 @ubfe_offset_3_width_4(i32 %src) { -; CHECK-LABEL: @ubfe_offset_3_width_4( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC:%.*]], 3 
+; CHECK-LABEL: define i32 @ubfe_offset_3_width_4 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC]], 3 ; CHECK-NEXT: [[BFE:%.*]] = and i32 [[TMP1]], 15 ; CHECK-NEXT: ret i32 [[BFE]] ; @@ -1303,7 +1447,8 @@ } define i32 @ubfe_0_0_0() { -; CHECK-LABEL: @ubfe_0_0_0( +; CHECK-LABEL: define i32 @ubfe_0_0_0 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) @@ -1311,7 +1456,8 @@ } define i32 @ubfe_neg1_5_7() { -; CHECK-LABEL: @ubfe_neg1_5_7( +; CHECK-LABEL: define i32 @ubfe_neg1_5_7 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 127 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 -1, i32 5, i32 7) @@ -1319,7 +1465,8 @@ } define i32 @ubfe_undef_src_i32(i32 %offset, i32 %width) { -; CHECK-LABEL: @ubfe_undef_src_i32( +; CHECK-LABEL: define i32 @ubfe_undef_src_i32 +; CHECK-SAME: (i32 [[OFFSET:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 undef ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 undef, i32 %offset, i32 %width) @@ -1327,8 +1474,9 @@ } define i32 @ubfe_undef_offset_i32(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_undef_offset_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 undef, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_undef_offset_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 undef, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width) @@ -1336,8 +1484,9 @@ } define i32 @ubfe_undef_width_i32(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_undef_width_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 undef) +; CHECK-LABEL: define i32 @ubfe_undef_width_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 undef) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef) @@ -1345,8 +1494,9 @@ } define i64 @ubfe_offset_33_width_4_i64(i64 %src) { -; CHECK-LABEL: @ubfe_offset_33_width_4_i64( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[SRC:%.*]], 33 +; CHECK-LABEL: define i64 @ubfe_offset_33_width_4_i64 +; CHECK-SAME: (i64 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[SRC]], 33 ; CHECK-NEXT: [[BFE:%.*]] = and i64 [[TMP1]], 15 ; CHECK-NEXT: ret i64 [[BFE]] ; @@ -1355,8 +1505,9 @@ } define i64 @ubfe_offset_0_i64(i64 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_0_i64( -; CHECK-NEXT: [[BFE:%.*]] = call i64 @llvm.amdgcn.ubfe.i64(i64 [[SRC:%.*]], i32 0, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i64 @ubfe_offset_0_i64 +; CHECK-SAME: (i64 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i64 @llvm.amdgcn.ubfe.i64(i64 [[SRC]], i32 0, i32 [[WIDTH]]) ; CHECK-NEXT: ret i64 [[BFE]] ; %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width) @@ -1364,8 +1515,9 @@ } define i64 @ubfe_offset_32_width_32_i64(i64 %src) { -; CHECK-LABEL: @ubfe_offset_32_width_32_i64( -; CHECK-NEXT: [[BFE:%.*]] = lshr i64 [[SRC:%.*]], 32 +; CHECK-LABEL: define i64 @ubfe_offset_32_width_32_i64 +; CHECK-SAME: (i64 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = lshr i64 [[SRC]], 32 ; CHECK-NEXT: ret i64 [[BFE]] ; %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 32, i32 32) @@ -1380,8 +1532,9 @@ declare i64 
@llvm.amdgcn.sbfe.i64(i64, i32, i32) nounwind readnone define i32 @sbfe_offset_31(i32 %src, i32 %width) { -; CHECK-LABEL: @sbfe_offset_31( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.sbfe.i32(i32 [[SRC:%.*]], i32 31, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @sbfe_offset_31 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.sbfe.i32(i32 [[SRC]], i32 31, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 31, i32 %width) @@ -1389,7 +1542,8 @@ } define i32 @sbfe_neg1_5_7() { -; CHECK-LABEL: @sbfe_neg1_5_7( +; CHECK-LABEL: define i32 @sbfe_neg1_5_7 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 -1 ; %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 -1, i32 5, i32 7) @@ -1397,8 +1551,9 @@ } define i64 @sbfe_offset_32_width_32_i64(i64 %src) { -; CHECK-LABEL: @sbfe_offset_32_width_32_i64( -; CHECK-NEXT: [[BFE:%.*]] = ashr i64 [[SRC:%.*]], 32 +; CHECK-LABEL: define i64 @sbfe_offset_32_width_32_i64 +; CHECK-SAME: (i64 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = ashr i64 [[SRC]], 32 ; CHECK-NEXT: ret i64 [[BFE]] ; %bfe = call i64 @llvm.amdgcn.sbfe.i64(i64 %src, i32 32, i32 32) @@ -1416,15 +1571,16 @@ define void @exp_disabled_inputs_to_undef(float %x, float %y, float %z, float %w) { ; enable src0..src3 constants -; CHECK-LABEL: @exp_disabled_inputs_to_undef( +; CHECK-LABEL: define void @exp_disabled_inputs_to_undef +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[W:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.000000e+00, float undef, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float 2.000000e+00, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float 5.000000e-01, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float 4.000000e+00, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float [[X:%.*]], float undef, float undef, float undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float [[Y:%.*]], float undef, float undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float [[Z:%.*]], float undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float [[W:%.*]], i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float [[X]], float undef, float undef, float undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float [[Y]], float undef, float undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float [[Z]], float undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float [[W]], i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.000000e+00, float undef, float 5.000000e-01, 
float undef, i1 true, i1 false) @@ -1464,16 +1620,17 @@ define void @exp_compr_disabled_inputs_to_undef(<2 x half> %xy, <2 x half> %zw) { -; CHECK-LABEL: @exp_compr_disabled_inputs_to_undef( +; CHECK-LABEL: define void @exp_compr_disabled_inputs_to_undef +; CHECK-SAME: (<2 x half> [[XY:%.*]], <2 x half> [[ZW:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> , <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> , <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> , <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> [[XY:%.*]], <2 x half> undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> [[XY]], <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> [[XY]], <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> [[XY]], <2 x half> undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> [[ZW:%.*]], i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> [[ZW]], i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> [[XY]], <2 x half> [[ZW]], i1 true, i1 false) ; CHECK-NEXT: ret void ; @@ -1499,8 +1656,9 @@ declare float @llvm.amdgcn.fmed3.f32(float, float, float) nounwind readnone define float @fmed3_f32(float %x, float %y, float %z) { -; CHECK-LABEL: @fmed3_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +; CHECK-LABEL: define float @fmed3_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float [[Z]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) @@ -1508,8 +1666,9 @@ } define float @fmed3_canonicalize_x_c0_c1_f32(float %x) { -; CHECK-LABEL: @fmed3_canonicalize_x_c0_c1_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float 0.000000e+00, float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_x_c0_c1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float 0.000000e+00, float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0) @@ -1517,8 +1676,9 @@ } define float @fmed3_canonicalize_c0_x_c1_f32(float %x) { -; CHECK-LABEL: @fmed3_canonicalize_c0_x_c1_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float 0.000000e+00, float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_c0_x_c1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float 0.000000e+00, float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 
= call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0) @@ -1526,8 +1686,9 @@ } define float @fmed3_canonicalize_c0_c1_x_f32(float %x) { -; CHECK-LABEL: @fmed3_canonicalize_c0_c1_x_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float 0.000000e+00, float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_c0_c1_x_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float 0.000000e+00, float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x) @@ -1535,8 +1696,9 @@ } define float @fmed3_canonicalize_x_y_c_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_canonicalize_x_y_c_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_x_y_c_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.0) @@ -1544,8 +1706,9 @@ } define float @fmed3_canonicalize_x_c_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_canonicalize_x_c_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_x_c_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float %y) @@ -1553,8 +1716,9 @@ } define float @fmed3_canonicalize_c_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_canonicalize_c_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_c_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float %y) @@ -1562,8 +1726,9 @@ } define float @fmed3_undef_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_undef_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_undef_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y) @@ -1571,8 +1736,9 @@ } define float @fmed3_fmf_undef_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_fmf_undef_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call nnan float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_fmf_undef_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call nnan float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call nnan float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y) @@ -1580,8 +1746,9 @@ } define float 
@fmed3_x_undef_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_undef_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_undef_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float undef, float %y) @@ -1589,8 +1756,9 @@ } define float @fmed3_x_y_undef_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_y_undef_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_y_undef_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float undef) @@ -1598,8 +1766,9 @@ } define float @fmed3_qnan0_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_qnan0_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_qnan0_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000000000000, float %x, float %y) @@ -1607,8 +1776,9 @@ } define float @fmed3_x_qnan0_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_qnan0_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_qnan0_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8000000000000, float %y) @@ -1616,8 +1786,9 @@ } define float @fmed3_x_y_qnan0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_y_qnan0_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_y_qnan0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 0x7FF8000000000000) @@ -1625,8 +1796,9 @@ } define float @fmed3_qnan1_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_qnan1_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_qnan1_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float %x, float %y) @@ -1635,7 +1807,8 @@ ; This can return any of the qnans. 
define float @fmed3_qnan0_qnan1_qnan2_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_qnan0_qnan1_qnan2_f32( +; CHECK-LABEL: define float @fmed3_qnan0_qnan1_qnan2_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 0x7FF8030000000000 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float 0x7FF8002000000000, float 0x7FF8030000000000) @@ -1643,7 +1816,8 @@ } define float @fmed3_constant_src0_0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src0_0_f32( +; CHECK-LABEL: define float @fmed3_constant_src0_0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float -1.0, float 4.0) @@ -1651,7 +1825,8 @@ } define float @fmed3_constant_src0_1_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src0_1_f32( +; CHECK-LABEL: define float @fmed3_constant_src0_1_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float 4.0, float -1.0) @@ -1659,7 +1834,8 @@ } define float @fmed3_constant_src1_0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src1_0_f32( +; CHECK-LABEL: define float @fmed3_constant_src1_0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 0.5, float 4.0) @@ -1667,7 +1843,8 @@ } define float @fmed3_constant_src1_1_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src1_1_f32( +; CHECK-LABEL: define float @fmed3_constant_src1_1_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float 0.5, float -1.0) @@ -1675,7 +1852,8 @@ } define float @fmed3_constant_src2_0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src2_0_f32( +; CHECK-LABEL: define float @fmed3_constant_src2_0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 4.0, float 0.5) @@ -1683,7 +1861,8 @@ } define float @fmed3_constant_src2_1_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src2_1_f32( +; CHECK-LABEL: define float @fmed3_constant_src2_1_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float -1.0, float 0.5) @@ -1691,31 +1870,35 @@ } define float @fmed3_x_qnan0_qnan1_f32(float %x) { -; CHECK-LABEL: @fmed3_x_qnan0_qnan1_f32( -; CHECK-NEXT: ret float [[X:%.*]] +; CHECK-LABEL: define float @fmed3_x_qnan0_qnan1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: ret float [[X]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8001000000000, float 0x7FF8002000000000) ret float %med3 } define float @fmed3_qnan0_x_qnan1_f32(float %x) { -; CHECK-LABEL: @fmed3_qnan0_x_qnan1_f32( -; CHECK-NEXT: ret float [[X:%.*]] +; CHECK-LABEL: define float @fmed3_qnan0_x_qnan1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: ret float [[X]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float %x, float 0x7FF8002000000000) ret float %med3 } define float @fmed3_qnan0_qnan1_x_f32(float %x) { -; CHECK-LABEL: @fmed3_qnan0_qnan1_x_f32( -; CHECK-NEXT: ret float [[X:%.*]] +; CHECK-LABEL: define float @fmed3_qnan0_qnan1_x_f32 +; 
CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: ret float [[X]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0x7FF8002000000000, float %x) ret float %med3 } define float @fmed3_nan_0_1_f32() { -; CHECK-LABEL: @fmed3_nan_0_1_f32( +; CHECK-LABEL: define float @fmed3_nan_0_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0.0, float 1.0) @@ -1723,7 +1906,8 @@ } define float @fmed3_0_nan_1_f32() { -; CHECK-LABEL: @fmed3_0_nan_1_f32( +; CHECK-LABEL: define float @fmed3_0_nan_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 0x7FF8001000000000, float 1.0) @@ -1731,7 +1915,8 @@ } define float @fmed3_0_1_nan_f32() { -; CHECK-LABEL: @fmed3_0_1_nan_f32( +; CHECK-LABEL: define float @fmed3_0_1_nan_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 1.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8001000000000) @@ -1739,7 +1924,8 @@ } define float @fmed3_undef_0_1_f32() { -; CHECK-LABEL: @fmed3_undef_0_1_f32( +; CHECK-LABEL: define float @fmed3_undef_0_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float 0.0, float 1.0) @@ -1747,7 +1933,8 @@ } define float @fmed3_0_undef_1_f32() { -; CHECK-LABEL: @fmed3_0_undef_1_f32( +; CHECK-LABEL: define float @fmed3_0_undef_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float undef, float 1.0) @@ -1755,7 +1942,8 @@ } define float @fmed3_0_1_undef_f32() { -; CHECK-LABEL: @fmed3_0_1_undef_f32( +; CHECK-LABEL: define float @fmed3_0_1_undef_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 1.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float undef) @@ -1771,8 +1959,9 @@ declare i64 @llvm.amdgcn.icmp.i64.i1(i1, i1, i32 immarg) nounwind readnone convergent define i64 @invalid_icmp_code(i32 %a, i32 %b) { -; CHECK-LABEL: @invalid_icmp_code( -; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 31) +; CHECK-LABEL: define i64 @invalid_icmp_code +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 31) ; CHECK-NEXT: [[OVER:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 42) ; CHECK-NEXT: [[OR:%.*]] = or i64 [[UNDER]], [[OVER]] ; CHECK-NEXT: ret i64 [[OR]] @@ -1784,7 +1973,8 @@ } define i64 @icmp_constant_inputs_false() { -; CHECK-LABEL: @icmp_constant_inputs_false( +; CHECK-LABEL: define i64 @icmp_constant_inputs_false +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i64 0 ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 32) @@ -1792,8 +1982,9 @@ } define i64 @icmp_constant_inputs_true() { -; CHECK-LABEL: @icmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR17:[0-9]+]] +; CHECK-LABEL: define i64 @icmp_constant_inputs_true +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR16:[0-9]+]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34) @@ -1801,8 +1992,9 @@ } define i64 @icmp_constant_to_rhs_slt(i32 %x) { -; CHECK-LABEL: 
@icmp_constant_to_rhs_slt( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[X:%.*]], i32 9, i32 38) +; CHECK-LABEL: define i64 @icmp_constant_to_rhs_slt +; CHECK-SAME: (i32 [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[X]], i32 9, i32 38) ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 %x, i32 40) @@ -1810,8 +2002,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1821,8 +2014,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ne_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ne_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 33) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ne_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ne i32 %a, %b @@ -1832,8 +2026,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_sle_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 41) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_sle_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 41) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp sle i32 %a, %b @@ -1843,8 +2038,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ugt_i64( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A:%.*]], i64 [[B:%.*]], i32 34) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A]], i64 [[B]], i32 34) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ugt i64 %a, %b @@ -1854,8 +2050,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_swap_i64( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A:%.*]], i64 [[B:%.*]], i32 34) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A]], i64 [[B]], i32 34) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ugt i64 %a, %b @@ -1865,8 +2062,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 1) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 1) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp oeq float %a, %b @@ -1876,8 +2074,9 
@@ } define i64 @fold_icmp_ne_0_zext_fcmp_une_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_une_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 14) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_une_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 14) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp une float %a, %b @@ -1887,8 +2086,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64(double %a, double %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_olt_f64( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f64(double [[A:%.*]], double [[B:%.*]], i32 4) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64 +; CHECK-SAME: (double [[A:%.*]], double [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f64(double [[A]], double [[B]], i32 4) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp olt double %a, %b @@ -1898,8 +2098,9 @@ } define i64 @fold_icmp_sext_icmp_ne_0_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_ne_0_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_ne_0_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1909,8 +2110,9 @@ } define i64 @fold_icmp_eq_0_zext_icmp_eq_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_eq_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 33) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_icmp_eq_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1920,8 +2122,9 @@ } define i64 @fold_icmp_eq_0_zext_icmp_slt_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_slt_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 39) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_icmp_slt_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 39) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp slt i32 %a, %b @@ -1931,8 +2134,9 @@ } define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_oeq_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 14) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 14) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp oeq float %a, %b @@ -1942,8 +2146,9 @@ } define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ule_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 2) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 
@llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 2) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp ule float %a, %b @@ -1953,8 +2158,9 @@ } define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ogt_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 13) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 13) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp ogt float %a, %b @@ -1964,8 +2170,9 @@ } define i64 @fold_icmp_zext_icmp_eq_1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_zext_icmp_eq_1_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_zext_icmp_eq_1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1975,8 +2182,9 @@ } define i64 @fold_icmp_zext_argi1_eq_1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_zext_argi1_eq_1_i32( -; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_zext_argi1_eq_1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_COND]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -1986,8 +2194,9 @@ } define i64 @fold_icmp_zext_argi1_eq_neg1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_zext_argi1_eq_neg1_i32( -; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_zext_argi1_eq_neg1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_COND]], i32 -1, i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -1997,8 +2206,9 @@ } define i64 @fold_icmp_sext_argi1_eq_1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_sext_argi1_eq_1_i32( -; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_sext_argi1_eq_1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[SEXT_COND]], i32 1, i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2008,8 +2218,9 @@ } define i64 @fold_icmp_sext_argi1_eq_neg1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i32( -; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_sext_argi1_eq_neg1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[SEXT_COND]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2019,8 +2230,9 @@ } define i64 @fold_icmp_sext_argi1_eq_neg1_i64(i1 %cond) { -; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i64( -; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND:%.*]] to i64 +; CHECK-LABEL: define i64 @fold_icmp_sext_argi1_eq_neg1_i64 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND]] to i64 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[SEXT_COND]], 
i64 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2031,8 +2243,9 @@ ; TODO: Should be able to fold to false define i64 @fold_icmp_sext_icmp_eq_1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_eq_1_i32( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_eq_1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], [[B]] ; CHECK-NEXT: [[SEXT_CMP:%.*]] = sext i1 [[CMP]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[SEXT_CMP]], i32 1, i32 32) ; CHECK-NEXT: ret i64 [[MASK]] @@ -2044,8 +2257,9 @@ } define i64 @fold_icmp_sext_icmp_eq_neg1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_eq_neg1_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_eq_neg1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -2055,8 +2269,9 @@ } define i64 @fold_icmp_sext_icmp_sge_neg1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_sge_neg1_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 39) +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_sge_neg1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 39) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp sge i32 %a, %b @@ -2066,8 +2281,9 @@ } define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_not_icmp_ne_0_zext_icmp_sle_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 38) +; CHECK-LABEL: define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 38) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp sle i32 %a, %b @@ -2078,9 +2294,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i4( -; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2091,9 +2308,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i8( -; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2104,8 +2322,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 
@llvm.amdgcn.icmp.i64.i16(i16 [[A:%.*]], i16 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A]], i16 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i16 %a, %b @@ -2115,9 +2334,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i36(i36 %a, i36 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i36( -; CHECK-NEXT: [[TMP1:%.*]] = zext i36 [[A:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = zext i36 [[B:%.*]] to i64 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i36 +; CHECK-SAME: (i36 [[A:%.*]], i36 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i36 [[A]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i36 [[B]] to i64 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2128,8 +2348,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i128(i128 %a, i128 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i128( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i128 +; CHECK-SAME: (i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A]], [[B]] ; CHECK-NEXT: [[ZEXT_CMP:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_CMP]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] @@ -2141,8 +2362,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f16(half %a, half %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f16(half [[A:%.*]], half [[B:%.*]], i32 1) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f16 +; CHECK-SAME: (half [[A:%.*]], half [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f16(half [[A]], half [[B]], i32 1) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp oeq half %a, %b @@ -2152,8 +2374,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f128(fp128 %a, fp128 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f128( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f128 +; CHECK-SAME: (fp128 [[A:%.*]], fp128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A]], [[B]] ; CHECK-NEXT: [[ZEXT_CMP:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_CMP]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] @@ -2165,9 +2388,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_slt_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i4( -; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = sext i4 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_slt_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = sext i4 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 40) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2178,9 +2402,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_slt_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i8( -; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_slt_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) 
#[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 40) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2191,8 +2416,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_slt_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A:%.*]], i16 [[B:%.*]], i32 40) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_slt_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A]], i16 [[B]], i32 40) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp slt i16 %a, %b @@ -2202,9 +2428,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i4( -; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 36) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2215,9 +2442,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i8( -; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 36) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2228,8 +2456,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A:%.*]], i16 [[B:%.*]], i32 36) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A]], i16 [[B]], i32 36) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ult i16 %a, %b @@ -2241,8 +2470,9 @@ ; 1-bit NE comparisons define i64 @fold_icmp_i1_ne_0_icmp_eq_i1(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i1( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i1 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2252,8 +2482,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ne_i1(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ne_i1( -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ne_i1 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2263,8 +2494,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_sle_i1(i32 %a, i32 %b) { 
-; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_sle_i1( -; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_sle_i1 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2274,8 +2506,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ugt_i64( -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2285,8 +2518,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_swap_i64( -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2296,8 +2530,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f32( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2307,8 +2542,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_une_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_une_f32( -; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_une_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2318,8 +2554,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64(double %a, double %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_olt_f64( -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64 +; CHECK-SAME: (double [[A:%.*]], double [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2329,8 +2566,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i4( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i4 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i4 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2340,8 +2578,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i8(i8 %a, i8 %b) { -; CHECK-LABEL: 
@fold_icmp_i1_ne_0_icmp_eq_i8( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2351,8 +2590,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i16( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2362,8 +2602,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i36(i36 %a, i36 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i36( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i36 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i36 +; CHECK-SAME: (i36 [[A:%.*]], i36 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i36 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2373,8 +2614,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i128(i128 %a, i128 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i128( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i128 +; CHECK-SAME: (i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2384,8 +2626,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16(half %a, half %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f16( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq half [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16 +; CHECK-SAME: (half [[A:%.*]], half [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq half [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2395,8 +2638,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128(fp128 %a, fp128 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f128( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128 +; CHECK-SAME: (fp128 [[A:%.*]], fp128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2406,8 +2650,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_slt_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i4( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i4 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_slt_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i4 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2417,8 +2662,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_slt_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i8( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A:%.*]], 
[[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_slt_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2428,8 +2674,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_slt_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i16( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_slt_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2439,8 +2686,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i4( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i4 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i4 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2450,8 +2698,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i8( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2461,8 +2710,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i16( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2478,8 +2728,9 @@ declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32 immarg) nounwind readnone convergent define i64 @invalid_fcmp_code(float %a, float %b) { -; CHECK-LABEL: @invalid_fcmp_code( -; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 -1) +; CHECK-LABEL: define i64 @invalid_fcmp_code +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 -1) ; CHECK-NEXT: [[OVER:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 16) ; CHECK-NEXT: [[OR:%.*]] = or i64 [[UNDER]], [[OVER]] ; CHECK-NEXT: ret i64 [[OR]] @@ -2491,7 +2742,8 @@ } define i64 @fcmp_constant_inputs_false() { -; CHECK-LABEL: @fcmp_constant_inputs_false( +; CHECK-LABEL: define i64 @fcmp_constant_inputs_false +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i64 0 ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 1) @@ -2499,8 +2751,9 @@ } define i64 @fcmp_constant_inputs_true() { -; CHECK-LABEL: @fcmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]] +; CHECK-LABEL: define i64 @fcmp_constant_inputs_true +; CHECK-SAME: () 
#[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4) @@ -2508,8 +2761,9 @@ } define i64 @fcmp_constant_to_rhs_olt(float %x) { -; CHECK-LABEL: @fcmp_constant_to_rhs_olt( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[X:%.*]], float 4.000000e+00, i32 2) +; CHECK-LABEL: define i64 @fcmp_constant_to_rhs_olt +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[X]], float 4.000000e+00, i32 2) ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 4.0, float %x, i32 4) @@ -2524,8 +2778,9 @@ declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent define i64 @ballot_nocombine_64(i1 %i) { -; CHECK-LABEL: @ballot_nocombine_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]]) +; CHECK-LABEL: define i64 @ballot_nocombine_64 +; CHECK-SAME: (i1 [[I:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I]]) ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i) @@ -2533,7 +2788,8 @@ } define i64 @ballot_zero_64() { -; CHECK-LABEL: @ballot_zero_64( +; CHECK-LABEL: define i64 @ballot_zero_64 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i64 0 ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 0) @@ -2541,8 +2797,9 @@ } define i64 @ballot_one_64() { -; CHECK-LABEL: @ballot_one_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]] +; CHECK-LABEL: define i64 @ballot_one_64 +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]] ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 1) @@ -2550,8 +2807,9 @@ } define i32 @ballot_nocombine_32(i1 %i) { -; CHECK-LABEL: @ballot_nocombine_32( -; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]]) +; CHECK-LABEL: define i32 @ballot_nocombine_32 +; CHECK-SAME: (i1 [[I:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I]]) ; CHECK-NEXT: ret i32 [[B]] ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 %i) @@ -2559,7 +2817,8 @@ } define i32 @ballot_zero_32() { -; CHECK-LABEL: @ballot_zero_32( +; CHECK-LABEL: define i32 @ballot_zero_32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 0) @@ -2567,8 +2826,9 @@ } define i32 @ballot_one_32() { -; CHECK-LABEL: @ballot_one_32( -; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR17]] +; CHECK-LABEL: define i32 @ballot_one_32 +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR16]] ; CHECK-NEXT: ret i32 [[B]] ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 1) @@ -2582,7 +2842,8 @@ declare i1 @llvm.amdgcn.wqm.vote(i1) define float @wqm_vote_true() { -; CHECK-LABEL: @wqm_vote_true( +; CHECK-LABEL: define float @wqm_vote_true +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: main_body: ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -2593,7 +2854,8 @@ } define float @wqm_vote_false() { -; CHECK-LABEL: @wqm_vote_false( +; CHECK-LABEL: define float @wqm_vote_false +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: main_body: ; CHECK-NEXT: ret float 0.000000e+00 ; @@ -2604,7 +2866,8 @@ } define float @wqm_vote_undef() { -; CHECK-LABEL: @wqm_vote_undef( +; 
CHECK-LABEL: @wqm_vote_undef(
+; CHECK-LABEL: define float @wqm_vote_undef
+; CHECK-SAME: () #[[ATTR3]] {
; CHECK-NEXT: main_body:
; CHECK-NEXT: ret float 0.000000e+00
;
@@ -2621,7 +2884,8 @@
declare void @llvm.amdgcn.kill(i1)

define void @kill_true() {
-; CHECK-LABEL: @kill_true(
+; CHECK-LABEL: define void @kill_true
+; CHECK-SAME: () #[[ATTR3]] {
; CHECK-NEXT: ret void
;
call void @llvm.amdgcn.kill(i1 true)
@@ -2637,8 +2901,9 @@
@gv = constant i32 0

define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2660,8 +2925,9 @@
}

define i32 @readfirstlane_idempotent(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-LABEL: define i32 @readfirstlane_idempotent
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2671,8 +2937,9 @@
}

define i32 @readfirstlane_readlane(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-LABEL: define i32 @readfirstlane_readlane
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2681,12 +2948,13 @@
}

define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
+; CHECK-LABEL: define i32 @readfirstlane_readfirstlane_different_block
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2699,12 +2967,13 @@
}

define i32 @readfirstlane_readlane_different_block(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_readlane_different_block(
+; CHECK-LABEL: define i32 @readfirstlane_readlane_different_block
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2716,6 +2985,44 @@
ret i32 %read1
}

+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: define i32 @readfirstlane_bitcast
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[READ:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[READ]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: define float @bitcast_readfirstlane_bitcast
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ %cast.read = bitcast i32 %read to float
+ ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: define i32 @readfirstlane_bitcast_multi_use
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: store float [[ARG]], ptr undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[READ:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[READ]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ store i32 %bitcast.arg, i32* undef
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ ret i32 %read
+}
+
; --------------------------------------------------------------------
; llvm.amdgcn.readlane
; --------------------------------------------------------------------
@@ -2723,8 +3030,9 @@
declare i32 @llvm.amdgcn.readlane(i32, i32)

define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
-; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
+; CHECK-LABEL: define amdgpu_kernel void @readlane_constant
+; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 7)
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2746,8 +3054,9 @@
}

define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
-; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-LABEL: define i32 @readlane_idempotent
+; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2756,9 +3065,10 @@
}

define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
-; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
+; CHECK-LABEL: define i32 @readlane_idempotent_different_lanes
+; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE0:%.*]], i32 [[LANE1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1]])
; CHECK-NEXT: ret i32 [[READ1]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
@@ -2767,8 +3077,9 @@
}

define i32 @readlane_readfirstlane(i32 %arg) {
-; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT:
[[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-LABEL: define i32 @readlane_readfirstlane +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) ; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) @@ -2777,12 +3088,13 @@ } define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) { -; CHECK-LABEL: @readlane_idempotent_different_block( +; CHECK-LABEL: define i32 @readlane_idempotent_different_block +; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]]) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE]]) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2796,12 +3108,13 @@ define i32 @readlane_readfirstlane_different_block(i32 %arg) { -; CHECK-LABEL: @readlane_readfirstlane_different_block( +; CHECK-LABEL: define i32 @readlane_readfirstlane_different_block +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 0) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2820,35 +3133,38 @@ declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) define amdgpu_kernel void @update_dpp_no_combine(ptr addrspace(1) %out, i32 %in1, i32 %in2) { -; CHECK-LABEL: @update_dpp_no_combine( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 [[IN1:%.*]], i32 [[IN2:%.*]], i32 1, i32 1, i32 1, i1 false) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @update_dpp_no_combine +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 [[IN1]], i32 [[IN2]], i32 1, i32 1, i32 1, i1 false) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) - store i32 %tmp0, ptr addrspace(1) %out + %val0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) + store i32 %val0, ptr addrspace(1) %out ret void } define amdgpu_kernel void @update_dpp_drop_old(ptr addrspace(1) %out, i32 %in1, i32 %in2) { -; CHECK-LABEL: @update_dpp_drop_old( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN2:%.*]], i32 3, i32 15, i32 15, i1 true) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @update_dpp_drop_old +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN2]], i32 3, i32 15, i32 15, i1 true) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(1) 
[[OUT]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1) - store i32 %tmp0, ptr addrspace(1) %out + %val0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1) + store i32 %val0, ptr addrspace(1) %out ret void } define amdgpu_kernel void @update_dpp_undef_old(ptr addrspace(1) %out, i32 %in1) { -; CHECK-LABEL: @update_dpp_undef_old( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN1:%.*]], i32 4, i32 15, i32 15, i1 true) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @update_dpp_undef_old +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN1:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN1]], i32 4, i32 15, i32 15, i1 true) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1) - store i32 %tmp0, ptr addrspace(1) %out + %val0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1) + store i32 %val0, ptr addrspace(1) %out ret void } @@ -2860,9 +3176,10 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1 immarg, i1 immarg) define amdgpu_kernel void @permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlane16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlane16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 false) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) @@ -2871,9 +3188,10 @@ } define amdgpu_kernel void @permlane16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlane16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlane16_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) @@ -2882,9 +3200,10 @@ } define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], 
align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) @@ -2899,9 +3218,10 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1 immarg, i1 immarg) define amdgpu_kernel void @permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlanex16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlanex16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 false) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) @@ -2910,9 +3230,10 @@ } define amdgpu_kernel void @permlanex16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlanex16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlanex16_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) @@ -2921,9 +3242,10 @@ } define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) @@ -2991,9 +3313,10 @@ declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 define 
amdgpu_kernel void @image_sample_a16_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3003,9 +3326,10 @@ } define amdgpu_kernel void @image_sample_a16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3016,9 +3340,10 @@ } define amdgpu_kernel void @image_sample_a16_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { -; CHECK-LABEL: @image_sample_a16_3d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3031,9 +3356,10 @@ define amdgpu_kernel void @image_sample_a16_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { ; -; CHECK-LABEL: @image_sample_a16_cube( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void 
@image_sample_a16_cube +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S]], half [[T]], half [[FACE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3045,9 +3371,10 @@ } define amdgpu_kernel void @image_sample_a16_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_1darray( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3058,9 +3385,10 @@ } define amdgpu_kernel void @image_sample_a16_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_2darray( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3072,9 +3400,10 @@ } define amdgpu_kernel void @image_sample_a16_c_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> 
[[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3084,9 +3413,10 @@ } define amdgpu_kernel void @image_sample_a16_c_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3097,9 +3427,10 @@ } define amdgpu_kernel void @image_sample_a16_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3110,9 +3441,10 @@ } define amdgpu_kernel void @image_sample_a16_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3124,9 +3456,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, 
half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3137,9 +3470,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3151,9 +3485,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s) { -; CHECK-LABEL: @image_sample_a16_b16_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3164,10 +3499,11 @@ } define amdgpu_kernel void @image_sample_a16_b32_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) { -; CHECK-LABEL: @image_sample_a16_b32_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3177,9 +3513,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_b16_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3191,11 +3528,12 @@ } define amdgpu_kernel void @image_sample_a16_b32_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_b32_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], float [[T32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3206,9 +3544,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_b16_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3219,10 +3558,11 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_b32_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3232,9 +3572,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_b16_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3246,11 +3587,12 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { -; CHECK-LABEL: 
@image_sample_a16_c_b32_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], float [[T32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3261,9 +3603,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b16_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3275,11 +3618,12 @@ } define amdgpu_kernel void @image_sample_a16_b32_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b32_cl_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> 
[[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3290,9 +3634,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b16_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3305,12 +3650,13 @@ } define amdgpu_kernel void @image_sample_a16_b32_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b32_cl_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3322,9 +3668,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_b16_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_cl_1d 
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3336,11 +3683,12 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_b32_cl_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3351,9 +3699,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_b16_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3366,12 +3715,13 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { -; CHECK-LABEL: 
@image_sample_a16_c_b32_cl_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3383,9 +3733,10 @@ } define amdgpu_kernel void @image_sample_a16_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3397,9 +3748,10 @@ } define amdgpu_kernel void @image_sample_a16_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half 
[[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3414,9 +3766,10 @@ } define amdgpu_kernel void @image_sample_a16_d_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) { -; CHECK-LABEL: @image_sample_a16_d_3d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DRDH]], half [[DSDV]], half [[DTDV]], half [[DRDV]], half [[S]], half [[T]], half [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3434,9 +3787,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_c_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3448,9 +3802,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr 
addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3465,9 +3820,10 @@ } define amdgpu_kernel void @image_sample_a16_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3480,9 +3836,10 @@ } define amdgpu_kernel void @image_sample_a16_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3498,9 +3855,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: 
@image_sample_a16_c_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3513,9 +3871,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3531,9 +3890,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) 
[[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3545,9 +3905,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3562,9 +3923,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_c_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3576,9 +3938,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] 
{ +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3593,9 +3956,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3608,9 +3972,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3626,9 +3991,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> 
[[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3641,9 +4007,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3659,9 +4026,10 @@ } define amdgpu_kernel void @image_sample_a16_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { -; CHECK-LABEL: @image_sample_a16_l_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half [[S]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3672,9 +4040,10 @@ } define amdgpu_kernel void @image_sample_a16_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { -; CHECK-LABEL: @image_sample_a16_l_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half [[S:%.*]], half 
[[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3686,9 +4055,10 @@ } define amdgpu_kernel void @image_sample_a16_c_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { -; CHECK-LABEL: @image_sample_a16_c_l_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3699,9 +4069,10 @@ } define amdgpu_kernel void @image_sample_a16_c_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { -; CHECK-LABEL: @image_sample_a16_c_l_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3713,9 +4084,10 @@ } define amdgpu_kernel void @image_sample_a16_lz_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_lz_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void 
@image_sample_a16_lz_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3725,9 +4097,10 @@ } define amdgpu_kernel void @image_sample_a16_lz_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_lz_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_lz_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3738,9 +4111,10 @@ } define amdgpu_kernel void @image_sample_a16_c_lz_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_lz_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_lz_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3750,9 +4124,10 @@ } define amdgpu_kernel void @image_sample_a16_c_lz_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_lz_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_lz_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x 
float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3763,9 +4138,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V1(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V1( -; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V1 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3781,9 +4157,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V2( -; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3799,9 +4176,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const( -; CHECK-NEXT: [[RES:%.*]] = call <2 x 
float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half 0xH3400, half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half 0xH3400, half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3816,11 +4194,12 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const_noopt( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE]] to float +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3835,9 +4214,10 @@ } define amdgpu_kernel void @image_load_a16_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 
[[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3847,10 +4227,11 @@ } define amdgpu_kernel void @image_load_a16_mip_1d_noopt(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_1d_noopt( -; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S:%.*]] to i32 -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_1d_noopt +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S]] to i32 +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = sext i16 %s to i32 @@ -3860,9 +4241,10 @@ } define amdgpu_kernel void @image_load_a16_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s, i16 %t) { -; CHECK-LABEL: @image_load_a16_mip_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]], i16 [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S]], i16 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3873,9 +4255,10 @@ } define amdgpu_kernel void @image_load_a16_mip_2d_const(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_2d_const( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 -1, <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_2d_const +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S]], i16 -1, <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3885,10 +4268,11 @@ } define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_2d_const_noopt( -; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S:%.*]] to i32 -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt +; 
CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S]] to i32 +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3902,9 +4286,10 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @image_sample_g16_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3915,9 +4300,10 @@ } define amdgpu_kernel void @image_sample_g16_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3930,9 +4316,10 @@ } define amdgpu_kernel void @image_sample_g16_d_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { -; CHECK-LABEL: @image_sample_g16_d_3d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x 
float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DRDH]], half [[DSDV]], half [[DTDV]], half [[DRDV]], float [[S]], float [[T]], float [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3947,9 +4334,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_c_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3960,9 +4348,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_c_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3975,9 +4364,10 @@ } define amdgpu_kernel void @image_sample_g16_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 
x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3988,9 +4378,10 @@ } define amdgpu_kernel void @image_sample_g16_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4003,9 +4394,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> 
[[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4016,9 +4408,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4031,9 +4424,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4044,9 +4438,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> 
inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4059,9 +4454,10 @@ } define amdgpu_kernel void @image_sample_g16_c_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_c_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4072,9 +4468,10 @@ } define amdgpu_kernel void @image_sample_g16_c_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_c_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4087,9 +4484,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float 
[[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4100,9 +4498,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4115,9 +4514,10 @@ } define amdgpu_kernel void @image_sample_g16_c_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4128,9 +4528,10 @@ } define amdgpu_kernel void 
@image_sample_g16_c_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4143,9 +4544,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V1(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { -; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V1( -; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V1 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4158,9 +4560,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V2(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { -; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V2( -; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float 
[[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V2 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4177,9 +4580,10 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @image_sample_a16_1d_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1d_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4189,9 +4593,10 @@ } define amdgpu_kernel void @image_sample_a16_1d_nnan_ninf_nsz(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d_nnan_ninf_nsz( -; CHECK-NEXT: [[RES:%.*]] = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1d_nnan_ninf_nsz +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4201,9 +4606,10 @@ } define amdgpu_kernel void @image_sample_a16_1d_fast(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d_fast( -; CHECK-NEXT: [[RES:%.*]] = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: 
define amdgpu_kernel void @image_sample_a16_1d_fast +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4213,9 +4619,10 @@ } define amdgpu_kernel void @image_sample_a16_2d_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_2d_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2d_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4226,9 +4633,10 @@ } define amdgpu_kernel void @image_sample_a16_3d_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { -; CHECK-LABEL: @image_sample_a16_3d_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_3d_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4241,9 +4649,10 @@ define amdgpu_kernel void @image_sample_a16_cube_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { ; -; CHECK-LABEL: @image_sample_a16_cube_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cube_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S]], half [[T]], half [[FACE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 
false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4255,9 +4664,10 @@ } define amdgpu_kernel void @image_sample_a16_1darray_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_1darray_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1darray_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4268,9 +4678,10 @@ } define amdgpu_kernel void @image_sample_a16_2darray_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_2darray_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2darray_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4297,10 +4708,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2darray.v4f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 define amdgpu_kernel void @sample_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) { -; CHECK-LABEL: @sample_l_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4310,10 +4722,11 @@ } define amdgpu_kernel void @sample_l_2d(ptr addrspace(1) %out, 
<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4323,10 +4736,11 @@ } define amdgpu_kernel void @sample_c_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) { -; CHECK-LABEL: @sample_c_l_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4336,10 +4750,11 @@ } define amdgpu_kernel void @sample_c_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_c_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4349,10 +4764,11 @@ } define amdgpu_kernel void @sample_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %lod) { -; CHECK-LABEL: @sample_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { 
; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4362,10 +4778,11 @@ } define amdgpu_kernel void @sample_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4375,10 +4792,11 @@ } define amdgpu_kernel void @sample_c_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %lod) { -; CHECK-LABEL: @sample_c_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4388,10 +4806,11 @@ } define amdgpu_kernel void @sample_c_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_c_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float 
[[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4401,10 +4820,11 @@ } define amdgpu_kernel void @gather4_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4414,10 +4834,11 @@ } define amdgpu_kernel void @gather4_c_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_c_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4427,10 +4848,11 @@ } define amdgpu_kernel void @gather4_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 15, i32 
[[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4440,10 +4862,11 @@ } define amdgpu_kernel void @gather4_c_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_c_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4453,10 +4876,11 @@ } define amdgpu_kernel void @gather4_c_l_o_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %slice, float %lod) { -; CHECK-LABEL: @gather4_c_l_o_2darray( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_l_o_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], float [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4470,10 +4894,11 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @load_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s) { -; CHECK-LABEL: @load_mip_1d( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; 
CHECK-NEXT: ret void ; main_body: @@ -4483,10 +4908,11 @@ } define amdgpu_kernel void @load_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t) { -; CHECK-LABEL: @load_mip_2d( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4496,10 +4922,11 @@ } define amdgpu_kernel void @load_mip_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @load_mip_3d( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4509,10 +4936,11 @@ } define amdgpu_kernel void @load_mip_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t) { -; CHECK-LABEL: @load_mip_1darray( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_1darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4522,10 +4950,11 @@ } define amdgpu_kernel void @load_mip_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @load_mip_2darray( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; 
CHECK-NEXT: ret void ; main_body: @@ -4535,10 +4964,11 @@ } define amdgpu_kernel void @load_mip_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @load_mip_cube( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_cube +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4549,9 +4979,10 @@ define amdgpu_kernel void @store_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { -; CHECK-LABEL: @store_mip_1d( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4560,9 +4991,10 @@ } define amdgpu_kernel void @store_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { -; CHECK-LABEL: @store_mip_2d( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4571,9 +5003,10 @@ } define amdgpu_kernel void @store_mip_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @store_mip_3d( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4582,9 +5015,10 @@ } define amdgpu_kernel void @store_mip_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { -; CHECK-LABEL: @store_mip_1darray( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_1darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x 
float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4593,9 +5027,10 @@ } define amdgpu_kernel void @store_mip_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @store_mip_2darray( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4604,9 +5039,10 @@ } define amdgpu_kernel void @store_mip_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @store_mip_cube( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_cube +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4649,10 +5085,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 define amdgpu_kernel void @sample_b_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @sample_b_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4662,10 +5099,11 @@ } define amdgpu_kernel void @sample_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @sample_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) 
#[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4675,10 +5113,11 @@ } define amdgpu_kernel void @sample_c_b_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) { -; CHECK-LABEL: @sample_c_b_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4688,10 +5127,11 @@ } define amdgpu_kernel void @sample_c_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @sample_c_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4701,10 +5141,11 @@ } define amdgpu_kernel void @sample_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s) { -; CHECK-LABEL: @sample_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], <8 x 
i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4714,10 +5155,11 @@ } define amdgpu_kernel void @sample_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { -; CHECK-LABEL: @sample_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4727,10 +5169,11 @@ } define amdgpu_kernel void @sample_c_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s) { -; CHECK-LABEL: @sample_c_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4740,10 +5183,11 @@ } define amdgpu_kernel void @sample_c_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @sample_c_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4753,10 +5197,11 @@ } define amdgpu_kernel void 
@gather4_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @gather4_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4766,10 +5211,11 @@ } define amdgpu_kernel void @gather4_c_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @gather4_c_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4779,10 +5225,11 @@ } define amdgpu_kernel void @gather4_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { -; CHECK-LABEL: @gather4_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4792,10 +5239,11 @@ } define amdgpu_kernel void @gather4_c_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @gather4_c_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], 
float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4805,10 +5253,11 @@ } define amdgpu_kernel void @sample_c_b_o_a16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @sample_c_b_o_a16_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_o_a16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f16(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f16(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4819,10 +5268,11 @@ ; Check that bias is not optimized away if > 0 define amdgpu_kernel void @sample_b_1d_pos(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @sample_b_1d_pos( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d_pos +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float 1.000000e+00, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float 1.000000e+00, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4833,10 +5283,11 @@ ; Check that bias is not optimized away if < 0 define amdgpu_kernel void @sample_b_1d_neg(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @sample_b_1d_neg( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d_neg +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float -1.000000e+00, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 
false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float -1.000000e+00, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4847,10 +5298,11 @@ ; Zero bias + A16 define amdgpu_kernel void @sample_b_1d_a16(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @sample_b_1d_a16( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d_a16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4865,10 +5317,11 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @offset_sample_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @offset_sample_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4878,10 +5331,11 @@ } define amdgpu_kernel void @offset_sample_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @offset_sample_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4891,10 +5345,11 @@ } define amdgpu_kernel void @offset_sample_c_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float 
%s) { -; CHECK-LABEL: @offset_sample_c_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4904,10 +5359,11 @@ } define amdgpu_kernel void @offset_sample_c_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4917,10 +5373,11 @@ } define amdgpu_kernel void @offset_sample_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4930,10 +5387,11 @@ } define amdgpu_kernel void @offset_sample_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4943,10 +5401,11 @@ } define amdgpu_kernel void @offset_sample_c_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4956,10 +5415,11 @@ } define amdgpu_kernel void @offset_sample_c_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4969,10 +5429,11 @@ } define amdgpu_kernel void @offset_sample_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s) { -; CHECK-LABEL: @offset_sample_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; 
CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4982,10 +5443,11 @@ } define amdgpu_kernel void @offset_sample_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { -; CHECK-LABEL: @offset_sample_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4995,10 +5457,11 @@ } define amdgpu_kernel void @offset_sample_c_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s) { -; CHECK-LABEL: @offset_sample_c_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5008,10 +5471,11 @@ } define amdgpu_kernel void @offset_sample_c_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 
15, float [[BIAS]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5021,10 +5485,11 @@ } define amdgpu_kernel void @offset_sample_b_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_b_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5034,10 +5499,11 @@ } define amdgpu_kernel void @offset_sample_b_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_b_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5047,10 +5513,11 @@ } define amdgpu_kernel void @offset_sample_c_b_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_b_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float 
[[ZCOMPARE]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5060,10 +5527,11 @@ } define amdgpu_kernel void @offset_sample_c_b_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_c_b_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5073,10 +5541,11 @@ } define amdgpu_kernel void @offset_sample_d_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_d_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5086,10 +5555,11 @@ } define amdgpu_kernel void @offset_sample_d_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_d_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; 
CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5099,10 +5569,11 @@ } define amdgpu_kernel void @offset_sample_c_d_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_c_d_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5112,10 +5583,11 @@ } define amdgpu_kernel void @offset_sample_c_d_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_d_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5125,10 +5597,11 @@ } define amdgpu_kernel void @offset_sample_d_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_d_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5138,10 +5611,11 @@ } define amdgpu_kernel void @offset_sample_d_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_d_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5151,10 +5625,11 @@ } define amdgpu_kernel void @offset_sample_c_d_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_d_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5164,10 +5639,11 @@ } define amdgpu_kernel void @offset_sample_c_d_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { 
-; CHECK-LABEL: @offset_sample_c_d_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5177,10 +5653,11 @@ } define amdgpu_kernel void @offset_sample_cd_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_cd_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5190,10 +5667,11 @@ } define amdgpu_kernel void @offset_sample_cd_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_cd_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: 
store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5203,10 +5681,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_c_cd_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5216,10 +5695,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_cd_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5229,10 +5709,11 @@ } define amdgpu_kernel void @offset_sample_cd_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_cd_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) 
[[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5242,10 +5723,11 @@ } define amdgpu_kernel void @offset_sample_cd_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_cd_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5255,10 +5737,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cd_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5268,10 +5751,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cd_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float 
[[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5281,10 +5765,11 @@ } define amdgpu_kernel void @offset_sample_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) { -; CHECK-LABEL: @offset_sample_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float [[S]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5294,10 +5779,11 @@ } define amdgpu_kernel void @offset_sample_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { -; CHECK-LABEL: @offset_sample_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float [[S]], float [[T]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5307,10 +5793,11 @@ } define amdgpu_kernel void @offset_sample_c_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) { -; CHECK-LABEL: @offset_sample_c_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]]) 
#[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5320,10 +5807,11 @@ } define amdgpu_kernel void @offset_sample_c_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @offset_sample_c_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5333,10 +5821,11 @@ } define amdgpu_kernel void @offset_sample_lz_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @offset_sample_lz_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_lz_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5346,10 +5835,11 @@ } define amdgpu_kernel void @offset_sample_lz_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @offset_sample_lz_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_lz_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: 
[[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5359,10 +5849,11 @@ } define amdgpu_kernel void @offset_sample_c_lz_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) { -; CHECK-LABEL: @offset_sample_c_lz_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_lz_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5372,10 +5863,11 @@ } define amdgpu_kernel void @offset_sample_c_lz_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_lz_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_lz_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5432,7 +5924,8 @@ declare i1 @llvm.amdgcn.is.shared(ptr) nounwind readnone define i1 @test_is_shared_null() nounwind { -; CHECK-LABEL: @test_is_shared_null( +; CHECK-LABEL: define i1 @test_is_shared_null +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.is.shared(ptr null) @@ -5440,7 +5933,8 @@ } define i1 @test_is_shared_undef() nounwind { -; CHECK-LABEL: @test_is_shared_undef( +; CHECK-LABEL: define i1 @test_is_shared_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 undef ; %val = call i1 @llvm.amdgcn.is.shared(ptr undef) @@ -5454,7 +5948,8 @@ declare i1 @llvm.amdgcn.is.private(ptr) nounwind readnone define i1 @test_is_private_null() nounwind { -; CHECK-LABEL: @test_is_private_null( +; CHECK-LABEL: define i1 @test_is_private_null +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.is.private(ptr null) @@ -5462,7 +5957,8 @@ } define i1 @test_is_private_undef() nounwind { -; CHECK-LABEL: @test_is_private_undef( +; CHECK-LABEL: define i1 @test_is_private_undef +; CHECK-SAME: () #[[ATTR1]] { ; 
 ; CHECK-NEXT: ret i1 undef
 ;
   %val = call i1 @llvm.amdgcn.is.private(ptr undef)
@@ -5477,8 +5973,9 @@
 declare float @llvm.amdgcn.trig.preop.f32(float, i32)

 define double @trig_preop_constfold_variable_undef_arg(i32 %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_undef_arg(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_constfold_variable_undef_arg
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 %arg)
@@ -5486,8 +5983,9 @@
 }

 define double @trig_preop_constfold_variable_poison_arg(i32 %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_poison_arg(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_constfold_variable_poison_arg
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 %arg)
@@ -5495,8 +5993,9 @@
 }

 define double @trig_preop_constfold_variable_arg_undef(double %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_arg_undef(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 undef)
+; CHECK-LABEL: define double @trig_preop_constfold_variable_arg_undef
+; CHECK-SAME: (double [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG]], i32 undef)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 undef)
@@ -5504,8 +6003,9 @@
 }

 define double @trig_preop_constfold_variable_arg_poison(double %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_arg_poison(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 poison)
+; CHECK-LABEL: define double @trig_preop_constfold_variable_arg_poison
+; CHECK-SAME: (double [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG]], i32 poison)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 poison)
@@ -5513,8 +6013,9 @@
 }

 define double @trig_preop_constfold_variable_int(i32 %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_int(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_constfold_variable_int
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 %arg)
@@ -5522,8 +6023,9 @@
 }

 define double @trig_preop_qnan(i32 %arg) {
-; CHECK-LABEL: @trig_preop_qnan(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_qnan
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 %arg)
@@ -5531,8 +6033,9 @@
 }

 define double @trig_preop_snan(i32 %arg) {
-; CHECK-LABEL: @trig_preop_snan(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_snan
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 %arg)
@@ -5540,7 +6043,8 @@
 }

 define double @trig_preop_inf_0() {
-; CHECK-LABEL: @trig_preop_inf_0(
+; CHECK-LABEL: define double @trig_preop_inf_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5549,7 +6053,8 @@
 }

 define double @trig_preop_ninf_0() {
-; CHECK-LABEL: @trig_preop_ninf_0(
+; CHECK-LABEL: define double @trig_preop_ninf_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5558,8 +6063,9 @@
 }

 define double @trig_preop_variable_fp(double %arg) {
-; CHECK-LABEL: @trig_preop_variable_fp(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 5)
+; CHECK-LABEL: define double @trig_preop_variable_fp
+; CHECK-SAME: (double [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG]], i32 5)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 5)
@@ -5567,8 +6073,9 @@
 }

 define double @trig_preop_variable_args(double %arg0, i32 %arg1) {
-; CHECK-LABEL: @trig_preop_variable_args(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG0:%.*]], i32 [[ARG1:%.*]])
+; CHECK-LABEL: define double @trig_preop_variable_args
+; CHECK-SAME: (double [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG0]], i32 [[ARG1]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg0, i32 %arg1)
@@ -5576,7 +6083,8 @@
 }

 define double @trig_preop_constfold() {
-; CHECK-LABEL: @trig_preop_constfold(
+; CHECK-LABEL: define double @trig_preop_constfold
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5585,8 +6093,9 @@
 }

 define double @trig_preop_constfold_strictfp() {
-; CHECK-LABEL: @trig_preop_constfold_strictfp(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR16]]
+; CHECK-LABEL: define double @trig_preop_constfold_strictfp
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR15]]
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) strictfp
@@ -5594,7 +6103,8 @@
 }

 define double @trig_preop_constfold_0.0__0() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__0(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5603,7 +6113,8 @@
 }

 define double @trig_preop_constfold_0.0__1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__1(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__1
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 1)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5612,7 +6123,8 @@
 }

 define double @trig_preop_constfold_0.0__neg1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg1(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__neg1
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -1)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5621,7 +6133,8 @@
 }

 define double @trig_preop_constfold_0.0__9999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__9999999(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__9999999
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 9999999)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5630,7 +6143,8 @@
 }

 define double @trig_preop_constfold_0.0__neg999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg999999(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__neg999999
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -999999)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5639,7 +6153,8 @@
 }

 define double @trig_preop_constfold_0x0020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x0020000000000000_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x0020000000000000_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x10000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5648,7 +6163,8 @@
 }

 define double @trig_preop_constfold_0x001fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x001fffffffffffff_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x001fffffffffffff_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5657,7 +6173,8 @@
 }

 define double @trig_preop_constfold_0x8020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x8020000000000000_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x8020000000000000_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5666,7 +6183,8 @@
 }

 define double @trig_preop_constfold_0x801fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x801fffffffffffff_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x801fffffffffffff_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x801FFFFFFFFFFFFF, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -555,12 +555,12 @@
 define i32 @test_permlane16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) {
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg3
-  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
   %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)

   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg4
-  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   %v2 = call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   ret i32 %v2
 }
@@ -569,12 +569,12 @@
 define i32 @test_permlanex16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) {
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg3
-  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
   %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)

   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg4
-  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   %v2 = call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   ret i32 %v2
 }
@@ -600,7 +600,6 @@
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i32 %arg2
   ; CHECK-NEXT: %val0 = call float @llvm.amdgcn.interp.p2(float %arg0, float %arg1, i32 %arg2, i32 0, i32 0)
-  %val0 = call float @llvm.amdgcn.interp.p2(float %arg0, float %arg1, i32 %arg2, i32 0, i32 0)
   store volatile float %val0, ptr addrspace(1) undef