diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -255,6 +255,8 @@ //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlane16_f32, "fffUiUiIbIb", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_permlanex16_f32, "fffUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts") //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17354,6 +17354,35 @@ return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: + case AMDGPU::BI__builtin_amdgcn_permlane16_f32: + case AMDGPU::BI__builtin_amdgcn_permlanex16_f32: { + Intrinsic::ID Intrin; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlane16_f32: + Intrin = Intrinsic::amdgcn_permlane16; + break; + case AMDGPU::BI__builtin_amdgcn_permlanex16: + case AMDGPU::BI__builtin_amdgcn_permlanex16_f32: + Intrin = Intrinsic::amdgcn_permlanex16; + break; + } + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); + llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); + + llvm::Function *F = CGM.getIntrinsic(Intrin, Src1->getType()); + return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); + } + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane); + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane); // amdgcn workitem case AMDGPU::BI__builtin_amdgcn_workitem_id_x: return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl @@ -7,17 +7,30 @@ typedef unsigned long ulong; // CHECK-LABEL: @test_permlane16( -// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0); } // CHECK-LABEL: @test_permlanex16( -// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0); } +// CHECK-LABEL: @test_permlane16_f32( +// CHECK: call float @llvm.amdgcn.permlane16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false) +void 
test_permlane16_f32(global float* out, float a, float b, uint c, uint d) { + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 0, 0); +} + +// CHECK-LABEL: @test_permlanex16_f32( +// CHECK: call float @llvm.amdgcn.permlanex16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false) +void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d) { + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 0, 0); +} + + // CHECK-LABEL: @test_mov_dpp8( // CHECK: call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1) void test_mov_dpp8(global uint* out, uint a) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -292,14 +292,14 @@ } // CHECK-LABEL: @test_readfirstlane -// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl @@ -13,6 +13,16 @@ *out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16' must be a constant integer}} } +void test_permlane16_f32(global float* out, float a, float b, uint c, uint d, uint e) { + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}} + *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}} +} + +void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d, uint e) { + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}} + *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}} +} + void test_mov_dpp8(global uint* out, uint a, uint b) { *out = __builtin_amdgcn_mov_dpp8(a, b); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}} } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1667,15 +1667,13 @@ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; def int_amdgcn_readfirstlane : - ClangBuiltin<"__builtin_amdgcn_readfirstlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. 
def int_amdgcn_readlane : - ClangBuiltin<"__builtin_amdgcn_readlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // The value to write and lane select arguments must be uniform across the @@ -1683,10 +1681,10 @@ // undefined. def int_amdgcn_writelane : ClangBuiltin<"__builtin_amdgcn_writelane">, - Intrinsic<[llvm_i32_ty], [ - llvm_i32_ty, // uniform value to write: returned by the selected lane + Intrinsic<[llvm_any_ty], [ + LLVMMatchType<0>, // uniform value to write: returned by the selected lane llvm_i32_ty, // uniform lane select - llvm_i32_ty // returned by all lanes other than the selected one + LLVMMatchType<0> // returned by all lanes other than the selected one ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] >; @@ -1941,16 +1939,16 @@ //===----------------------------------------------------------------------===// // llvm.amdgcn.permlane16 -def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], +def int_amdgcn_permlane16 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlanex16 -def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], +def int_amdgcn_permlanex16 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -344,7 +344,7 @@ V = buildNonAtomicBinOp( B, Op, V, B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + Intrinsic::amdgcn_permlanex16, {B.getInt32Ty()}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()})); if (ST->isWave32()) @@ -359,7 +359,7 @@ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty}); Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); return buildNonAtomicBinOp(B, Op, Lane0, Lane32); @@ -402,7 +402,7 @@ // 48..63). assert(ST->hasPermLaneX16()); Value *const PermX = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + Intrinsic::amdgcn_permlanex16, {B.getInt32Ty()}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp( B, Op, V, B.CreateCall(UpdateDPP, @@ -411,8 +411,8 @@ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63.
- Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {V, B.getInt32(31)}); + Value *const Lane31 = B.CreateIntrinsic( + Intrinsic::amdgcn_readlane, {V->getType()}, {V, B.getInt32(31)}); V = buildNonAtomicBinOp( B, Op, V, B.CreateCall(UpdateDPP, @@ -439,9 +439,9 @@ B.getInt32(0xf), B.getFalse()}); } else { Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty}); Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {Ty}); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. @@ -592,7 +592,7 @@ // will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); assert(TyBitWidth == 32); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {Ty}, {NewV, LastLaneIdx}); } @@ -672,27 +672,8 @@ // We need to broadcast the value who was the lowest active lane (the first // lane) to all other lanes in the wavefront. We use an intrinsic for this, // but have to handle 64-bit broadcasts with two calls to this intrinsic. - Value *BroadcastI = nullptr; - - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); - CallInst *const ReadFirstLaneLo = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - CallInst *const ReadFirstLaneHi = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - Value *const PartialInsert = B.CreateInsertElement( - PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); - BroadcastI = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + Value *BroadcastI = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {Ty}, {PHI}); // Now that we have the result of our single atomic operation, we need to // get our individual lane's slice into the result. 
We use the lane offset diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -233,6 +233,10 @@ bool visitIntrinsicInst(IntrinsicInst &I); bool visitBitreverseIntrinsicInst(IntrinsicInst &I); + bool visitLaneIntrinsicInst(IntrinsicInst &I); + Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID, + Value *Data0, Value *Lane = nullptr, + Value *Data1 = nullptr); bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -943,6 +943,17 @@ if (match(Src, PatternMatch::m_Intrinsic())) { return IC.replaceInstUsesWith(II, Src); } + + // readfirstlane (bitcast x) -> bitcast (readfirstlane x) + Value *BitcastInput = nullptr; + if (match(Src, + PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) { + CallInst *NewCall = + IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, + {BitcastInput->getType()}, BitcastInput); + Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType()); + return IC.replaceInstUsesWith(II, NewCast); + } } else { // readlane (readlane x, y), y -> readlane x, y if (match(Src, PatternMatch::m_Intrinsic( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" @@ -76,6 +77,12 @@ bool canWidenScalarExtLoad(LoadInst &LI) const; bool visitLoadInst(LoadInst &LI); + bool visitIntrinsicInst(IntrinsicInst &I); + bool visitLaneIntrinsicInst(IntrinsicInst &I); + + Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID, + Value *Data0, Value *Data1, Value *Lane0, + Value *Lane1, Value *Mod0, Value *Mod1); }; } // end anonymous namespace @@ -177,6 +184,172 @@ return true; } +Value *AMDGPULateCodeGenPrepare::buildLegalLaneIntrinsic( + IRBuilder<> &B, Intrinsic::ID IID, Value *Data0, Value *Data1, Value *Lane0, + Value *Lane1, Value *Mod0, Value *Mod1) { + Type *Ty = Data0->getType(); + bool IsPermLane = (IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16); + + if (Ty == B.getInt32Ty()) { + if (IsPermLane) { + Value *Args[6] = {Data0, Data1, Lane0, Lane1, Mod0, Mod1}; + return B.CreateIntrinsic(IID, {Ty}, {Args}); + } + + // {write, read, readfirst}lane + Value *Args[3] = {Data0, Lane0, Data1}; + unsigned NumArgs = Data1 != nullptr ? 3 : Lane0 != nullptr ? 
2 : 1; + return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs}); + } + + if (auto *VecTy = dyn_cast(Ty)) { + Type *EltType = VecTy->getElementType(); + bool is16Bit = + (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) || + (EltType->isHalfTy()); + int EC = VecTy->getElementCount().getKnownMinValue(); + + Value *Result = UndefValue::get(Ty); + for (int i = 0; i < EC; i += 1 + is16Bit) { + Value *EltData0; + Value *EltData1 = nullptr; + + if (is16Bit) { + int Idxs[2] = {i, i + 1}; + EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs); + EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty()); + } else { + EltData0 = B.CreateExtractElement(Data0, i); + } + + if (Data1) { + if (is16Bit) { + int Idxs[2] = {i, i + 1}; + EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs); + EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty()); + } else { + EltData1 = B.CreateExtractElement(Data1, i); + } + } + + Value *EltResult = buildLegalLaneIntrinsic(B, IID, EltData0, EltData1, + Lane0, Lane1, Mod0, Mod1); + + if (is16Bit) { + EltResult = + B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2)); + for (int j = 0; j < 2; ++j) { + if (i + j >= EC) + break; + Result = B.CreateInsertElement( + Result, B.CreateExtractElement(EltResult, j), i + j); + } + } else { + Result = B.CreateInsertElement(Result, EltResult, i); + } + } + + return Result; + } + + unsigned BitWidth = DL->getTypeSizeInBits(Ty); + Type *IntTy = Ty; + + if (!Ty->isIntegerTy()) { + IntTy = IntegerType::get(Mod->getContext(), BitWidth); + if (!Ty->isPointerTy()) { + Data0 = B.CreateBitCast(Data0, IntTy); + if (Data1) + Data1 = B.CreateBitCast(Data1, IntTy); + } + } + + if ((BitWidth % 32) != 0) { + Type *ExtendedTy = + IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31); + Data0 = B.CreateZExt(Data0, ExtendedTy); + if (Data1) + Data1 = B.CreateZExt(Data1, ExtendedTy); + } + + if (BitWidth > 32) { + Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32); + Data0 = B.CreateBitCast(Data0, VecTy); + if (Data1) + Data1 = B.CreateBitCast(Data1, VecTy); + } + + Value *Result = + buildLegalLaneIntrinsic(B, IID, Data0, Data1, Lane0, Lane1, Mod0, Mod1); + + if ((BitWidth % 32) != 0) { + if (BitWidth > 32) { + Result = B.CreateBitCast( + Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) / 32)); + } + + Result = + B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth)); + } + + return Result->getType()->isPointerTy() ? Result + : B.CreateBitCast(Result, Ty); +} + +/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics +/// on i32. +/// +/// Done during codegen prepare purely because this turned out to be simpler +/// than doing it in this generality in SelectionDAG. 
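+///
+/// Illustrative sketch only (the value names below are invented and the exact
+/// instruction order may differ slightly): a 64-bit read such as
+///
+///   %v = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %x)
+///
+/// is rewritten in terms of the 32-bit intrinsic roughly as
+///
+///   %vec = bitcast i64 %x to <2 x i32>
+///   %e0  = extractelement <2 x i32> %vec, i64 0
+///   %r0  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %e0)
+///   %v0  = insertelement <2 x i32> undef, i32 %r0, i64 0
+///   %e1  = extractelement <2 x i32> %vec, i64 1
+///   %r1  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %e1)
+///   %v1  = insertelement <2 x i32> %v0, i32 %r1, i64 1
+///   %v   = bitcast <2 x i32> %v1 to i64
+///
+/// Vectors are split per 32-bit element (pairs of 16-bit elements are packed
+/// into one i32), and sub-dword scalars are zero-extended to 32 bits first.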
+bool AMDGPULateCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I) { + Type *Ty = I.getType(); + if (Ty->isIntegerTy(32) && Ty->getIntegerBitWidth() == 32) + return false; // already legal + + Value *Data0 = I.getArgOperand(0); + Value *Data1 = nullptr; + Value *Lane0 = nullptr; + Value *Lane1 = nullptr; + Value *Mod0 = nullptr; + Value *Mod1 = nullptr; + + if (I.getIntrinsicID() == Intrinsic::amdgcn_readlane) { + Lane0 = I.getArgOperand(1); + } else if (I.getIntrinsicID() == Intrinsic::amdgcn_writelane) { + Lane0 = I.getArgOperand(1); + Data1 = I.getArgOperand(2); + } else if (I.getIntrinsicID() == Intrinsic::amdgcn_permlane16 || + I.getIntrinsicID() == Intrinsic::amdgcn_permlanex16) { + Data1 = I.getArgOperand(1); + Lane0 = I.getArgOperand(2); + Lane1 = I.getArgOperand(3); + Mod0 = I.getArgOperand(4); + Mod1 = I.getArgOperand(5); + } + + IRBuilder<> Builder(&I); + Value *Legalized = buildLegalLaneIntrinsic(Builder, I.getIntrinsicID(), Data0, + Data1, Lane0, Lane1, Mod0, Mod1); + + I.replaceAllUsesWith(Legalized); + I.eraseFromParent(); + return true; +} + +bool AMDGPULateCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + return visitLaneIntrinsicInst(I); + default: + return false; + } +} + INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1117,13 +1117,13 @@ bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - if (TM->getOptLevel() > CodeGenOpt::None) - addPass(createAMDGPULateCodeGenPreparePass()); - if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { addPass(createAMDGPUAtomicOptimizerPass()); } + if (TM->getOptLevel() > CodeGenOpt::None) + addPass(createAMDGPULateCodeGenPreparePass()); + if (TM->getOptLevel() > CodeGenOpt::None) addPass(createSinkingPass()); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3145,7 +3145,7 @@ // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. 
def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (int_amdgcn_readfirstlane (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -681,8 +681,8 @@ class PermlanePat<SDPatternOperator permlane, Instruction inst> : GCNPat< - (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, - timm:$fi, timm:$bc), + (i32 (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc)), (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -7,14 +7,14 @@ ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out @@ -42,7 +42,7 @@ ret void } -; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2) +; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2) define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2) store i32 %tmp0, ptr addrspace(1) %out diff --git a/llvm/test/Assembler/autoupgrade-amdgpu-intrinsics.ll b/llvm/test/Assembler/autoupgrade-amdgpu-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Assembler/autoupgrade-amdgpu-intrinsics.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S < %s | FileCheck %s + +declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.readlane(i32, i32) +declare i32 @llvm.amdgcn.readfirstlane(i32) +declare i32 @llvm.amdgcn.writelane(i32, i32, i32) + +define void @test_permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { +; CHECK-LABEL: define void @test_permlane16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) { +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 [[SRC0]], i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 false) +; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1
false) + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #1 { +; CHECK-LABEL: define void @test_permlanex16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) { +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 [[SRC0]], i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 false) +; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readlane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: define void @test_readlane +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[READLANE:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC]], i32 15) +; CHECK-NEXT: store i32 [[READLANE]], ptr addrspace(1) [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %readlane = call i32 @llvm.amdgcn.readlane(i32 %src, i32 15) + store i32 %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: define void @test_readfirstlane +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[READFIRSTLANE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC]]) +; CHECK-NEXT: store i32 [[READFIRSTLANE]], ptr addrspace(1) [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) + store i32 %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +define void @test_writelane(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: define void @test_writelane +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[WRITELANE:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 1234, i32 15, i32 [[SRC]]) +; CHECK-NEXT: store i32 [[WRITELANE]], ptr addrspace(1) [[OUT]], align 2 +; CHECK-NEXT: ret void +; + %writelane = call i32 @llvm.amdgcn.writelane(i32 1234, i32 15, i32 %src) + store i32 %writelane, ptr addrspace(1) %out, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,5 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s ; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-atomic-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -9,7 +8,8 @@ declare void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg) define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_add( +; IR-LABEL: define amdgpu_cs void @atomic_add +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -22,7 +22,7 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 
[[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: ret void @@ -48,7 +48,8 @@ } define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_add_and_format( +; IR-LABEL: define amdgpu_cs void @atomic_add_and_format +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -61,11 +62,11 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void @@ -103,7 +104,8 @@ } define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_sub( +; IR-LABEL: define amdgpu_cs void @atomic_sub +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -116,7 +118,7 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: ret void @@ -142,7 +144,8 @@ } define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_sub_and_format( +; IR-LABEL: define amdgpu_cs void @atomic_sub_and_format +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -155,11 +158,11 @@ ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] ; IR: 9: -; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void @@ -197,7 +200,8 @@ } define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_xor( +; IR-LABEL: define amdgpu_cs void @atomic_xor +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -211,7 +215,7 @@ ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -238,7 +242,8 @@ } define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { -; IR-LABEL: @atomic_xor_and_format( +; IR-LABEL: define amdgpu_cs void @atomic_xor_and_format +; IR-SAME: (<4 x i32> inreg [[ARG:%.*]]) { ; IR-NEXT: .entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -252,11 +257,11 @@ ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG:%.*]], i32 0, i32 0, i32 0, i32 0) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s diff --git 
a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll @@ -1,10 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -13,7 +14,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -29,9 +30,10 @@ } define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_max_neg_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_max_neg_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 -1024 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -40,7 +42,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -56,9 +58,10 @@ } define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_soffset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_soffset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 9000 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -67,7 +70,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; 
IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -83,9 +86,10 @@ } define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32_huge_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_huge_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 47224239175595 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -94,7 +98,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -111,9 +115,10 @@ } define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_add_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -122,7 +127,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -130,10 +135,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -144,9 +149,10 @@ } define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; 
IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -156,7 +162,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -173,9 +179,10 @@ } define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -185,7 +192,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -193,10 +200,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -208,7 +215,8 @@ } define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_add_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -218,11 +226,11 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: 
[[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -233,7 +241,8 @@ } define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_add_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -243,18 +252,18 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -264,9 +273,10 @@ } define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -275,7 +285,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -291,9 +301,10 @@ } define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_add_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_add_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 
[[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -302,7 +313,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -310,10 +321,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -324,9 +335,10 @@ } define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_and_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -336,7 +348,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -348,9 +360,10 @@ } define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_and_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -360,14 +373,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -378,9 +391,10 @@ } define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -391,7 +405,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -404,9 +418,10 @@ } define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -417,14 +432,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -436,7 +451,8 @@ } define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_and_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] 
= call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -447,7 +463,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -458,7 +474,8 @@ } define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_and_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -469,14 +486,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -486,9 +503,10 @@ } define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -498,7 +516,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -510,9 +528,10 @@ } define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_and_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_and_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr 
addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -522,14 +541,14 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] -; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -540,9 +559,10 @@ } define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_sub_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -551,7 +571,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -567,9 +587,10 @@ } define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_sub_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -578,7 +599,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -586,10 +607,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ 
[[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -600,9 +621,10 @@ } define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -612,7 +634,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -629,9 +651,10 @@ } define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -641,7 +664,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -649,10 +672,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -664,7 +687,8 @@ } define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) 
%out, i32 %in) { -; IR-LABEL: @atomic_sub_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -674,11 +698,11 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: ret void @@ -689,7 +713,8 @@ } define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_sub_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -699,18 +724,18 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: -; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4 +; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT]], i32 [[TMP8]] seq_cst, align 4 ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -720,9 +745,10 @@ } define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -731,7 +757,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: 
[[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -747,9 +773,10 @@ } define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_sub_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_sub_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -758,7 +785,7 @@ ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]] +; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN]], [[TMP7]] ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] ; IR: 10: @@ -766,10 +793,10 @@ ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -780,9 +807,10 @@ } define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_max_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -792,7 +820,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -804,9 +832,10 @@ } define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_max_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; 
IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -816,15 +845,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -835,9 +864,10 @@ } define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -848,7 +878,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -861,9 +891,10 @@ } define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -874,15 +905,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: 
[[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -894,7 +925,8 @@ } define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_max_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -905,7 +937,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -916,7 +948,8 @@ } define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_max_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -927,15 +960,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -945,9 +978,10 @@ } define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_addr64 +; IR-SAME: (ptr addrspace(1) 
[[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -957,7 +991,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -969,9 +1003,10 @@ } define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_max_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_max_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -981,15 +1016,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1000,9 +1035,10 @@ } define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_umax_i32_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1012,7 +1048,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = 
atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1024,9 +1060,10 @@ } define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_umax_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1036,15 +1073,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1055,9 +1092,10 @@ } define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1068,7 +1106,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1081,9 +1119,10 @@ } define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset +; IR-SAME: (ptr 
addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1094,15 +1133,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1114,7 +1153,8 @@ } define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_umax_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1125,7 +1165,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1136,7 +1176,8 @@ } define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_umax_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1147,15 +1188,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1165,9 +1206,10 @@ } define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1177,7 +1219,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1189,9 +1231,10 @@ } define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_umax_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_umax_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1201,15 +1244,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1220,9 +1263,10 @@ } define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_min_i32_offset( +; 
IR-LABEL: define amdgpu_kernel void @atomic_min_i32_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1232,7 +1276,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1244,9 +1288,10 @@ } define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_min_i32_ret_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1256,15 +1301,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1275,9 +1320,10 @@ } define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1288,7 +1334,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], 
label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1301,9 +1347,10 @@ } define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_ret_addr64_offset( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1314,15 +1361,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1334,7 +1381,8 @@ } define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { -; IR-LABEL: @atomic_min_i32( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1345,7 +1393,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1356,7 +1404,8 @@ } define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { -; IR-LABEL: @atomic_min_i32_ret( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]]) { ; IR-NEXT: entry: ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -1367,15 
+1416,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: @@ -1385,9 +1434,10 @@ } define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1397,7 +1447,7 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: ret void @@ -1409,9 +1459,10 @@ } define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { -; IR-LABEL: @atomic_min_i32_ret_addr64( +; IR-LABEL: define amdgpu_kernel void @atomic_min_i32_ret_addr64 +; IR-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT2:%.*]], i32 [[IN:%.*]], i64 [[INDEX:%.*]]) { ; IR-NEXT: entry: -; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]] +; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[INDEX]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> ; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 @@ -1421,15 +1472,15 @@ ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] ; IR: 7: -; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4 +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN]] syncscope("workgroup") seq_cst, align 4 ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4 +; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2]], align 4 ; IR-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -6,6 +6,22 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) + + +declare i16 @llvm.amdgcn.permlane16.i16(i16, i16, i32, i32, i1, i1) #0 +declare half @llvm.amdgcn.permlane16.f16(half, half, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1) #0 +declare <3 x i16> @llvm.amdgcn.permlane16.v3i16(<3 x i16>, <3 x i16>, i32, i32, i1, i1) #0 +declare <9 x float> @llvm.amdgcn.permlane16.v9f32(<9 x float>, <9 x float>, i32, i32, i1, i1) #0 + +declare i16 @llvm.amdgcn.permlanex16.i16(i16, i16, i32, i32, i1, i1) #0 +declare half @llvm.amdgcn.permlanex16.f16(half, half, i32, i32, i1, i1) #0 +declare float @llvm.amdgcn.permlanex16.f32(float, float, i32, i32, i1, i1) #0 +declare <3 x i16> @llvm.amdgcn.permlanex16.v3i16(<3 x i16>, <3 x i16>, i32, i32, i1, i1) #0 +declare <9 x float> @llvm.amdgcn.permlanex16.v9f32(<9 x float>, <9 x float>, i32, i32, i1, i1) #0 + + + declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() @@ -917,6 +933,255 @@ ret void } +define void @test_permlane_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) #1 { +; GFX10-LABEL: test_permlane_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_permlane_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_f16(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) #1 { +; GFX10-LABEL: test_permlane_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-NEXT: global_store_short v[0:1], 
v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_permlane_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store half %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) #1 { +; GFX10-LABEL: test_permlane_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_permlane_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_v3i16(ptr addrspace(1) %out, <3 x i16> %src0, i32 %src1, i32 %src2) #1 { +; GFX10-SDAG-LABEL: test_permlane_v3i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlane_v3i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v3, s4, 16, v3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX10-GISEL-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:2 +; GFX10-GISEL-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlane_v3i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_permlane_v3i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v[0:1], v2, off offset:2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i16> @llvm.amdgcn.permlane16.v3i16(<3 x i16> %src0, <3 x i16> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <3 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlane_v9f32(ptr addrspace(1) %out, <9 x float> %src0, i32 %src1, i32 %src2) #1 { +; GFX10-SDAG-LABEL: test_permlane_v9f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-SDAG-NEXT: v_permlane16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlane_v9f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, 
v6, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlane_v9f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_permlane_v9f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <9 x float> @llvm.amdgcn.permlane16.v9f32(<9 x float> %src0, <9 x float> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <9 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX10-LABEL: v_permlanex16_b32_tid_tid: ; GFX10: ; %bb.0: @@ -1122,3 +1387,252 @@ store i32 %v, ptr addrspace(1) %out ret void } + +define void @test_permlanex16_i16(ptr addrspace(1) %out, i16 
%src0, i32 %src1, i32 %src2) #1 {
+; GFX10-LABEL: test_permlanex16_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0]
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_permlanex16_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0]
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store i16 %v, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_permlanex16_f16(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) #1 {
+; GFX10-LABEL: test_permlanex16_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0]
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_permlanex16_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0]
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 true, i1 false)
+ store half %v, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_permlanex16_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) #1 {
+; GFX10-LABEL: test_permlanex16_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0]
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_permlanex16_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0]
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 
false) + store float %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlanex16_v3i16(ptr addrspace(1) %out, <3 x i16> %src0, i32 %src1, i32 %src2) #1 { +; GFX10-SDAG-LABEL: test_permlanex16_v3i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlanex16_v3i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_lshl_or_b32 v3, s4, 16, v3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_short v[0:1], v2, off +; GFX10-GISEL-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:2 +; GFX10-GISEL-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlanex16_v3i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_permlanex16_v3i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v[0:1], v2, off offset:2 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v3, off offset:4 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x i16> @llvm.amdgcn.permlanex16.v3i16(<3 x i16> %src0, <3 x i16> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <3 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_permlanex16_v9f32(ptr addrspace(1) %out, <9 x float> %src0, i32 %src1, i32 %src2) #1 { +; 
GFX10-SDAG-LABEL: test_permlanex16_v9f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_permlanex16_v9f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v11 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v12 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v9, v9, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v10, v10, s4, s5 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; GFX10-GISEL-NEXT: global_store_dword v[0:1], v10, off offset:32 +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_permlanex16_v9f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-GISEL-LABEL: test_permlanex16_v9f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v11 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v12 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_clause 0x2 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v10, off offset:32 +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <9 x float> @llvm.amdgcn.permlanex16.v9f32(<9 x float> %src0, <9 x float> %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store <9 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,11 +1,20 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #0 +declare float @llvm.amdgcn.readfirstlane.f32(float) #0 +declare <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half>) #0 +declare <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16>) #0 +declare i16 @llvm.amdgcn.readfirstlane.i16(i16) #0 +declare half @llvm.amdgcn.readfirstlane.f16(half) #0 +declare <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16>) #0 +declare <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float>) #0 + ; CHECK-LABEL: {{^}}test_readfirstlane: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } @@ -15,7 +24,7 @@ ; CHECK-NOT: [[SGPR_VAL]] ; CHECK: ; use [[SGPR_VAL]] define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } @@ -25,7 +34,7 @@ ; CHECK-NOT: [[VVAL]] ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 
ret void } @@ -36,7 +45,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %m0) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } @@ -51,7 +60,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 { %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } @@ -62,10 +71,84 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { %alloca = alloca i32, addrspace(5) %int = ptrtoint ptr addrspace(5) %alloca to i32 - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %int) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } +; CHECK-LABEL: {{^}}test_readfirstlane_v2f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) #1 { + %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src) + store <2 x half> %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v2i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2i16(ptr addrspace(1) %out, <2 x i16> %src) #1 { + %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src) + store <2 x i16> %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + + +; CHECK-LABEL: {{^}}test_readfirstlane_i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) { + %readfirstlane = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src) + store i16 %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f16(ptr addrspace(1) %out, half %src) { + %readfirstlane = call half @llvm.amdgcn.readfirstlane.f16(half %src) + store half %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_f32: +; CHECK-NOT: v_cvt_f32_i32_e32 +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f32(ptr addrspace(1) %out, float %src) #1 { + %readfirstlane = call float @llvm.amdgcn.readfirstlane.f32(float %src) + store float %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v3i16: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v3i16(ptr addrspace(1) %out, <3 x i16> %src) { + %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16> %src) + store <3 x i16> %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v9f32: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, 
v2 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v9f32(ptr addrspace(1) %out, <9 x float> %src) { + %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float> %src) + store <9 x float> %readfirstlane, ptr addrspace(1) %out, align 2 + ret void +} + attributes #0 = { nounwind readnone convergent } -attributes #1 = { nounwind } +attributes #1 = { nounwind } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -1,6 +1,11 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s declare i32 @llvm.amdgcn.readlane(i32, i32) #0 +declare i16 @llvm.amdgcn.readlane.i16(i16, i32) #0 +declare half @llvm.amdgcn.readlane.f16(half, i32) #0 +declare float @llvm.amdgcn.readlane.f32(float, i32) #0 +declare <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16>, i32) #0 +declare <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float>, i32) #0 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg: ; CHECK-NOT: v_readlane_b32 @@ -77,8 +82,63 @@ ret void } +; CHECK-LABEL: {{^}}test_readlane_i16: +; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src) { + %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 15) + store i16 %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_f16: +; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_f16(ptr addrspace(1) %out, half %src) { + %readlane = call half @llvm.amdgcn.readlane.f16(half %src, i32 15) + store half %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_f32: +; CHECK-NOT: v_cvt_f32_i32_e32 +; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_f32(ptr addrspace(1) %out, float %src) { + %readlane = call float @llvm.amdgcn.readlane.f32(float %src, i32 15) + store float %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_v3i16: +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, +; CHECK-NOT: v_readlane_b32 +define void @test_readlane_v3i16(ptr addrspace(1) %out, <3 x i16> %src) { + %readlane = call <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16> %src, i32 15) + store <3 x i16> %readlane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_v9f32: +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15 +; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15 +; CHECK-NOT: v_readlane_b32 +define void 
@test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) { + %readlane = call <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float> %src, i32 15) + store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readnone } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -4,6 +4,11 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0 +declare i16 @llvm.amdgcn.writelane.i16(i16, i32, i16) #0 +declare half @llvm.amdgcn.writelane.f16(half, i32, half) #0 +declare float @llvm.amdgcn.writelane.f32(float, i32, float) #0 +declare <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16>, i32, <3 x i16>) #0 +declare <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float>, i32, <9 x float>) #0 ; CHECK-LABEL: {{^}}test_writelane_sreg: ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0 @@ -80,8 +85,66 @@ ret void } +; CHECK-LABEL: {{^}}test_writelane_i16: +; CHECK: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src) { + %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 1234, i32 15, i16 %src) + store i16 %writelane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_writelane_f16: +; CHECK: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_f16(ptr addrspace(1) %out, half %src) { + %writelane = call half @llvm.amdgcn.writelane.f16(half 1.0, i32 15, half %src) + store half %writelane, ptr addrspace(1) %out, align 2 + ret void +} + + +; CHECK-LABEL: {{^}}test_writelane_f32: +; CHECK-NOT: v_cvt_f32_i32_e32 +; CHECK: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_f32(ptr addrspace(1) %out, float %src) #1 { + %writelane = call float @llvm.amdgcn.writelane.f32(float 2.0, i32 15, float %src) + store float %writelane, ptr addrspace(1) %out, align 4 + ret void +} + + +; CHECK-LABEL: {{^}}test_writelane_v3i16: +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_v3i16(ptr addrspace(1) %out, <3 x i16> %src) { + %writelane = call <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src) + store <3 x i16> %writelane, ptr addrspace(1) %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_writelane_v9f32: +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15 +; CHECK-NOT: v_writelane_b32 +define void @test_writelane_v9f32(ptr addrspace(1) %out, <9 x float> %src) { + %writelane = call <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src) + 
store <9 x float> %writelane, ptr addrspace(1) %out, align 2 + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readnone } \ No newline at end of file diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine -S < %s | FileCheck %s ; -------------------------------------------------------------------- @@ -9,7 +9,8 @@ declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone define float @test_constant_fold_rcp_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_undef +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone @@ -17,7 +18,8 @@ } define float @test_constant_fold_rcp_f32_1() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_1( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 1.000000e+00 ; %val = call float @llvm.amdgcn.rcp.f32(float 1.0) nounwind readnone @@ -25,7 +27,8 @@ } define double @test_constant_fold_rcp_f64_1() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f64_1( +; CHECK-LABEL: define double @test_constant_fold_rcp_f64_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 1.000000e+00 ; %val = call double @llvm.amdgcn.rcp.f64(double 1.0) nounwind readnone @@ -33,7 +36,8 @@ } define float @test_constant_fold_rcp_f32_half() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_half( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_half +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 2.000000e+00 ; %val = call float @llvm.amdgcn.rcp.f32(float 0.5) nounwind readnone @@ -41,7 +45,8 @@ } define double @test_constant_fold_rcp_f64_half() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f64_half( +; CHECK-LABEL: define double @test_constant_fold_rcp_f64_half +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 2.000000e+00 ; %val = call double @llvm.amdgcn.rcp.f64(double 0.5) nounwind readnone @@ -49,7 +54,8 @@ } define float @test_constant_fold_rcp_f32_43() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f32_43( +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_43 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x3F97D05F40000000 ; %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone @@ -57,7 +63,8 @@ } define double @test_constant_fold_rcp_f64_43() nounwind { -; CHECK-LABEL: @test_constant_fold_rcp_f64_43( +; CHECK-LABEL: define double @test_constant_fold_rcp_f64_43 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x3F97D05F417D05F4 ; %val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone @@ -65,8 +72,9 @@ } define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp { -; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
4.300000e+01) #[[ATTR14:[0-9]+]] +; CHECK-LABEL: define float @test_constant_fold_rcp_f32_43_strictfp +; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR13:[0-9]+]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone @@ -82,7 +90,8 @@ declare double @llvm.amdgcn.sqrt.f64(double) nounwind readnone define half @test_constant_fold_sqrt_f16_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f16_undef( +; CHECK-LABEL: define half @test_constant_fold_sqrt_f16_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret half 0xH7E00 ; %val = call half @llvm.amdgcn.sqrt.f16(half undef) nounwind readnone @@ -90,7 +99,8 @@ } define float @test_constant_fold_sqrt_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_sqrt_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.sqrt.f32(float undef) nounwind readnone @@ -98,7 +108,8 @@ } define double @test_constant_fold_sqrt_f64_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f64_undef( +; CHECK-LABEL: define double @test_constant_fold_sqrt_f64_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x7FF8000000000000 ; %val = call double @llvm.amdgcn.sqrt.f64(double undef) nounwind readnone @@ -106,8 +117,9 @@ } define half @test_constant_fold_sqrt_f16_0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f16_0( -; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR15:[0-9]+]] +; CHECK-LABEL: define half @test_constant_fold_sqrt_f16_0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR14:[0-9]+]] ; CHECK-NEXT: ret half [[VAL]] ; %val = call half @llvm.amdgcn.sqrt.f16(half 0.0) nounwind readnone @@ -115,8 +127,9 @@ } define float @test_constant_fold_sqrt_f32_0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f32_0( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define float @test_constant_fold_sqrt_f32_0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.sqrt.f32(float 0.0) nounwind readnone @@ -124,8 +137,9 @@ } define double @test_constant_fold_sqrt_f64_0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f64_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define double @test_constant_fold_sqrt_f64_0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.sqrt.f64(double 0.0) nounwind readnone @@ -133,8 +147,9 @@ } define half @test_constant_fold_sqrt_f16_neg0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f16_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR15]] +; CHECK-LABEL: define half @test_constant_fold_sqrt_f16_neg0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR14]] ; CHECK-NEXT: ret half [[VAL]] ; %val = call half @llvm.amdgcn.sqrt.f16(half -0.0) nounwind readnone @@ -142,8 +157,9 @@ } define float @test_constant_fold_sqrt_f32_neg0() nounwind 
{ -; CHECK-LABEL: @test_constant_fold_sqrt_f32_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define float @test_constant_fold_sqrt_f32_neg0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.sqrt.f32(float -0.0) nounwind readnone @@ -151,8 +167,9 @@ } define double @test_constant_fold_sqrt_f64_neg0() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_f64_neg0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR15]] +; CHECK-LABEL: define double @test_constant_fold_sqrt_f64_neg0 +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR14]] ; CHECK-NEXT: ret double [[VAL]] ; %val = call double @llvm.amdgcn.sqrt.f64(double -0.0) nounwind readnone @@ -160,7 +177,8 @@ } define double @test_constant_fold_sqrt_snan_f64() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_snan_f64( +; CHECK-LABEL: define double @test_constant_fold_sqrt_snan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0x7FF0000000000001) ; CHECK-NEXT: ret double [[VAL]] ; @@ -169,7 +187,8 @@ } define double @test_constant_fold_sqrt_qnan_f64() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_qnan_f64( +; CHECK-LABEL: define double @test_constant_fold_sqrt_qnan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0x7FF8000000000000) ; CHECK-NEXT: ret double [[VAL]] ; @@ -178,7 +197,8 @@ } define double @test_constant_fold_sqrt_neg1() nounwind { -; CHECK-LABEL: @test_constant_fold_sqrt_neg1( +; CHECK-LABEL: define double @test_constant_fold_sqrt_neg1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -1.000000e+00) ; CHECK-NEXT: ret double [[VAL]] ; @@ -193,7 +213,8 @@ declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone define float @test_constant_fold_rsq_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_rsq_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_rsq_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone @@ -209,7 +230,8 @@ define float @test_constant_fold_frexp_mant_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_undef( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float undef ; %val = call float @llvm.amdgcn.frexp.mant.f32(float undef) @@ -217,7 +239,8 @@ } define double @test_constant_fold_frexp_mant_f64_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_undef( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double undef ; %val = call double @llvm.amdgcn.frexp.mant.f64(double undef) @@ -225,7 +248,8 @@ } define float @test_constant_fold_frexp_mant_f32_0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_0( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0.0) @@ -233,7 +257,8 @@ } define double @test_constant_fold_frexp_mant_f64_0() nounwind { -; CHECK-LABEL: 
@test_constant_fold_frexp_mant_f64_0( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0.000000e+00 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0.0) @@ -241,7 +266,8 @@ } define float @test_constant_fold_frexp_mant_f32_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_n0( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float -0.000000e+00 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float -0.0) @@ -249,7 +275,8 @@ } define double @test_constant_fold_frexp_mant_f64_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_n0( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double -0.000000e+00 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double -0.0) @@ -257,7 +284,8 @@ } define float @test_constant_fold_frexp_mant_f32_1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_1( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 1.0) @@ -265,7 +293,8 @@ } define double @test_constant_fold_frexp_mant_f64_1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_1( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 5.000000e-01 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 1.0) @@ -273,7 +302,8 @@ } define float @test_constant_fold_frexp_mant_f32_n1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_n1( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_n1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float -5.000000e-01 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float -1.0) @@ -281,7 +311,8 @@ } define double @test_constant_fold_frexp_mant_f64_n1() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_n1( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_n1 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double -5.000000e-01 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double -1.0) @@ -289,7 +320,8 @@ } define float @test_constant_fold_frexp_mant_f32_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_nan( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF8000000000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x7FF8000000000000) @@ -297,7 +329,8 @@ } define double @test_constant_fold_frexp_mant_f64_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_nan( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x7FF8000000000000 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FF8000000000000) @@ -305,7 +338,8 @@ } define float @test_constant_fold_frexp_mant_f32_inf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_inf( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x7FF0000000000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x7FF0000000000000) @@ -313,7 +347,8 @@ } define double @test_constant_fold_frexp_mant_f64_inf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_inf( +; CHECK-LABEL: define double 
@test_constant_fold_frexp_mant_f64_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x7FF0000000000000 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FF0000000000000) @@ -321,7 +356,8 @@ } define float @test_constant_fold_frexp_mant_f32_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_ninf( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0xFFF0000000000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0xFFF0000000000000) @@ -329,7 +365,8 @@ } define double @test_constant_fold_frexp_mant_f64_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_ninf( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0xFFF0000000000000 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0xFFF0000000000000) @@ -337,7 +374,8 @@ } define float @test_constant_fold_frexp_mant_f32_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_max_num( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 0x3FEFFFFFE0000000 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x47EFFFFFE0000000) @@ -345,7 +383,8 @@ } define double @test_constant_fold_frexp_mant_f64_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_max_num( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 0x3FEFFFFFFFFFFFFF ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FEFFFFFFFFFFFFF) @@ -353,7 +392,8 @@ } define float @test_constant_fold_frexp_mant_f32_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_min_num( +; CHECK-LABEL: define float @test_constant_fold_frexp_mant_f32_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x36A0000000000000) @@ -361,7 +401,8 @@ } define double @test_constant_fold_frexp_mant_f64_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_min_num( +; CHECK-LABEL: define double @test_constant_fold_frexp_mant_f64_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret double 5.000000e-01 ; %val = call double @llvm.amdgcn.frexp.mant.f64(double 4.940656e-324) @@ -377,7 +418,8 @@ declare i32 @llvm.amdgcn.frexp.exp.f64(double) nounwind readnone define i32 @test_constant_fold_frexp_exp_f32_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_undef( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 undef ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float undef) @@ -385,7 +427,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_undef() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_undef( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 undef ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double undef) @@ -393,7 +436,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0.0) @@ -401,7 +445,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_0() nounwind { -; CHECK-LABEL: 
@test_constant_fold_frexp_exp_f64_0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0.0) @@ -409,7 +454,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_n0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float -0.0) @@ -417,7 +463,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_n0() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_n0( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_n0 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double -0.0) @@ -425,7 +472,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 1024.0) @@ -433,7 +481,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 1024.0) @@ -441,7 +490,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_n1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_n1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_n1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float -1024.0) @@ -449,7 +499,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_n1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_n1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_n1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 11 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double -1024.0) @@ -457,7 +508,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_1_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_1_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_1_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -9 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0.0009765625) @@ -465,7 +517,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_1_1024() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_1_1024( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_1_1024 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -9 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0.0009765625) @@ -473,7 +526,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_nan( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x7FF8000000000000) @@ -481,7 +535,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_nan() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_nan( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_nan +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FF8000000000000) @@ -489,7 +544,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_inf() nounwind { -; 
CHECK-LABEL: @test_constant_fold_frexp_exp_f32_inf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x7FF0000000000000) @@ -497,7 +553,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_inf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_inf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_inf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FF0000000000000) @@ -505,7 +562,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_ninf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0xFFF0000000000000) @@ -513,7 +571,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_ninf() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_ninf( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_ninf +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 0 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0xFFF0000000000000) @@ -521,7 +580,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_max_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 128 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x47EFFFFFE0000000) @@ -529,7 +589,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_max_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_max_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_max_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 1024 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FEFFFFFFFFFFFFF) @@ -537,7 +598,8 @@ } define i32 @test_constant_fold_frexp_exp_f32_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_min_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f32_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -148 ; %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x36A0000000000000) @@ -545,7 +607,8 @@ } define i32 @test_constant_fold_frexp_exp_f64_min_num() nounwind { -; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_min_num( +; CHECK-LABEL: define i32 @test_constant_fold_frexp_exp_f64_min_num +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i32 -1073 ; %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 4.940656e-324) @@ -560,7 +623,8 @@ declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone define i1 @test_class_undef_mask_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_undef_mask_f32( +; CHECK-LABEL: define i1 @test_class_undef_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 undef) @@ -568,8 +632,9 @@ } define i1 @test_class_over_max_mask_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_over_max_mask_f32( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 1) +; CHECK-LABEL: define i1 @test_class_over_max_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 1) ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1025) @@ -577,7 +642,8 @@ } define i1 @test_class_no_mask_f32(float %x) nounwind { 
-; CHECK-LABEL: @test_class_no_mask_f32( +; CHECK-LABEL: define i1 @test_class_no_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 0) @@ -585,7 +651,8 @@ } define i1 @test_class_full_mask_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_full_mask_f32( +; CHECK-LABEL: define i1 @test_class_full_mask_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1023) @@ -593,7 +660,8 @@ } define i1 @test_class_undef_no_mask_f32() nounwind { -; CHECK-LABEL: @test_class_undef_no_mask_f32( +; CHECK-LABEL: define i1 @test_class_undef_no_mask_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 0) @@ -601,7 +669,8 @@ } define i1 @test_class_undef_full_mask_f32() nounwind { -; CHECK-LABEL: @test_class_undef_full_mask_f32( +; CHECK-LABEL: define i1 @test_class_undef_full_mask_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 1023) @@ -609,7 +678,8 @@ } define i1 @test_class_undef_val_f32() nounwind { -; CHECK-LABEL: @test_class_undef_val_f32( +; CHECK-LABEL: define i1 @test_class_undef_val_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 undef ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 4) @@ -617,7 +687,8 @@ } define i1 @test_class_undef_undef_f32() nounwind { -; CHECK-LABEL: @test_class_undef_undef_f32( +; CHECK-LABEL: define i1 @test_class_undef_undef_f32 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 undef ; %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) @@ -625,8 +696,9 @@ } define i1 @test_class_var_mask_f32(float %x, i32 %mask) nounwind { -; CHECK-LABEL: @test_class_var_mask_f32( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 [[MASK:%.*]]) +; CHECK-LABEL: define i1 @test_class_var_mask_f32 +; CHECK-SAME: (float [[X:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 [[MASK]]) ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 %mask) @@ -634,8 +706,9 @@ } define i1 @test_class_isnan_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_isnan_f32( -; CHECK-NEXT: [[VAL:%.*]] = fcmp uno float [[X:%.*]], 0.000000e+00 +; CHECK-LABEL: define i1 @test_class_isnan_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = fcmp uno float [[X]], 0.000000e+00 ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) @@ -643,8 +716,9 @@ } define i1 @test_class_isnan_f32_strict(float %x) nounwind { -; CHECK-LABEL: @test_class_isnan_f32_strict( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR16:[0-9]+]] +; CHECK-LABEL: define i1 @test_class_isnan_f32_strict +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 3) #[[ATTR15:[0-9]+]] ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) strictfp @@ -652,8 +726,9 @@ } define i1 @test_class_is_p0_n0_f32(float %x) nounwind { -; CHECK-LABEL: @test_class_is_p0_n0_f32( -; CHECK-NEXT: [[VAL:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 +; CHECK-LABEL: define i1 @test_class_is_p0_n0_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = fcmp oeq float [[X]], 0.000000e+00 ; CHECK-NEXT: ret i1 [[VAL]] ; 
%val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) @@ -661,8 +736,9 @@ } define i1 @test_class_is_p0_n0_f32_strict(float %x) nounwind { -; CHECK-LABEL: @test_class_is_p0_n0_f32_strict( -; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR16]] +; CHECK-LABEL: define i1 @test_class_is_p0_n0_f32_strict +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X]], i32 96) #[[ATTR15]] ; CHECK-NEXT: ret i1 [[VAL]] ; %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) strictfp @@ -670,7 +746,8 @@ } define i1 @test_constant_class_snan_test_snan_f64() nounwind { -; CHECK-LABEL: @test_constant_class_snan_test_snan_f64( +; CHECK-LABEL: define i1 @test_constant_class_snan_test_snan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 1) @@ -678,7 +755,8 @@ } define i1 @test_constant_class_qnan_test_qnan_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_qnan_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_qnan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 2) @@ -686,7 +764,8 @@ } define i1 @test_constant_class_qnan_test_snan_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_snan_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_snan_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 1) @@ -694,7 +773,8 @@ } define i1 @test_constant_class_ninf_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_ninf_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_ninf_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0xFFF0000000000000, i32 4) @@ -702,7 +782,8 @@ } define i1 @test_constant_class_pinf_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pinf_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_pinf_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000000, i32 4) @@ -710,7 +791,8 @@ } define i1 @test_constant_class_qnan_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 4) @@ -718,7 +800,8 @@ } define i1 @test_constant_class_snan_test_ninf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_snan_test_ninf_f64( +; CHECK-LABEL: define i1 @test_constant_class_snan_test_ninf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 4) @@ -726,7 +809,8 @@ } define i1 @test_constant_class_nnormal_test_nnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nnormal_test_nnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nnormal_test_nnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double -1.0, i32 8) @@ -734,7 +818,8 @@ } define i1 @test_constant_class_pnormal_test_nnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pnormal_test_nnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_pnormal_test_nnormal_f64 +; CHECK-SAME: () #[[ATTR1]] 
{ ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 8) @@ -742,7 +827,8 @@ } define i1 @test_constant_class_nsubnormal_test_nsubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nsubnormal_test_nsubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nsubnormal_test_nsubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x800fffffffffffff, i32 16) @@ -750,7 +836,8 @@ } define i1 @test_constant_class_psubnormal_test_nsubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_psubnormal_test_nsubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_psubnormal_test_nsubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x000fffffffffffff, i32 16) @@ -758,7 +845,8 @@ } define i1 @test_constant_class_nzero_test_nzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nzero_test_nzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_nzero_test_nzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double -0.0, i32 32) @@ -766,7 +854,8 @@ } define i1 @test_constant_class_pzero_test_nzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pzero_test_nzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_pzero_test_nzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0.0, i32 32) @@ -774,7 +863,8 @@ } define i1 @test_constant_class_pzero_test_pzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pzero_test_pzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_pzero_test_pzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0.0, i32 64) @@ -782,7 +872,8 @@ } define i1 @test_constant_class_nzero_test_pzero_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nzero_test_pzero_f64( +; CHECK-LABEL: define i1 @test_constant_class_nzero_test_pzero_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double -0.0, i32 64) @@ -790,7 +881,8 @@ } define i1 @test_constant_class_psubnormal_test_psubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_psubnormal_test_psubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_psubnormal_test_psubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x000fffffffffffff, i32 128) @@ -798,7 +890,8 @@ } define i1 @test_constant_class_nsubnormal_test_psubnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nsubnormal_test_psubnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nsubnormal_test_psubnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x800fffffffffffff, i32 128) @@ -806,7 +899,8 @@ } define i1 @test_constant_class_pnormal_test_pnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pnormal_test_pnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_pnormal_test_pnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 256) @@ -814,7 +908,8 @@ } define i1 @test_constant_class_nnormal_test_pnormal_f64() nounwind { -; CHECK-LABEL: @test_constant_class_nnormal_test_pnormal_f64( +; CHECK-LABEL: define i1 @test_constant_class_nnormal_test_pnormal_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; 
%val = call i1 @llvm.amdgcn.class.f64(double -1.0, i32 256) @@ -822,7 +917,8 @@ } define i1 @test_constant_class_pinf_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_pinf_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_pinf_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 true ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000000, i32 512) @@ -830,7 +926,8 @@ } define i1 @test_constant_class_ninf_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_ninf_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_ninf_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0xFFF0000000000000, i32 512) @@ -838,7 +935,8 @@ } define i1 @test_constant_class_qnan_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_qnan_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_qnan_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 512) @@ -846,7 +944,8 @@ } define i1 @test_constant_class_snan_test_pinf_f64() nounwind { -; CHECK-LABEL: @test_constant_class_snan_test_pinf_f64( +; CHECK-LABEL: define i1 @test_constant_class_snan_test_pinf_f64 +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 512) @@ -854,7 +953,8 @@ } define i1 @test_class_is_snan_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_snan_nnan_src( +; CHECK-LABEL: define i1 @test_class_is_snan_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: ret i1 false ; %nnan = fadd nnan float %x, 1.0 @@ -863,7 +963,8 @@ } define i1 @test_class_is_qnan_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_qnan_nnan_src( +; CHECK-LABEL: define i1 @test_class_is_qnan_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i1 false ; %nnan = fadd nnan float %x, 1.0 @@ -872,7 +973,8 @@ } define i1 @test_class_is_nan_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_nan_nnan_src( +; CHECK-LABEL: define i1 @test_class_is_nan_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i1 false ; %nnan = fadd nnan float %x, 1.0 @@ -881,8 +983,9 @@ } define i1 @test_class_is_nan_other_nnan_src(float %x) { -; CHECK-LABEL: @test_class_is_nan_other_nnan_src( -; CHECK-NEXT: [[NNAN:%.*]] = fadd nnan float [[X:%.*]], 1.000000e+00 +; CHECK-LABEL: define i1 @test_class_is_nan_other_nnan_src +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[NNAN:%.*]] = fadd nnan float [[X]], 1.000000e+00 ; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[NNAN]], i32 264) ; CHECK-NEXT: ret i1 [[CLASS]] ; @@ -898,8 +1001,9 @@ declare float @llvm.fabs.f32(float) nounwind readnone define float @cos_fneg_f32(float %x) { -; CHECK-LABEL: @cos_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fneg = fsub float -0.0, %x @@ -908,8 +1012,9 @@ } define float @cos_unary_fneg_f32(float %x) { -; CHECK-LABEL: @cos_unary_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_unary_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float 
@llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fneg = fneg float %x @@ -918,8 +1023,9 @@ } define float @cos_fabs_f32(float %x) { -; CHECK-LABEL: @cos_fabs_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fabs_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fabs = call float @llvm.fabs.f32(float %x) @@ -928,8 +1034,9 @@ } define float @cos_fabs_fneg_f32(float %x) { -; CHECK-LABEL: @cos_fabs_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fabs_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fabs = call float @llvm.fabs.f32(float %x) @@ -939,8 +1046,9 @@ } define float @cos_fabs_unary_fneg_f32(float %x) { -; CHECK-LABEL: @cos_fabs_unary_fneg_f32( -; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X:%.*]]) +; CHECK-LABEL: define float @cos_fabs_unary_fneg_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[COS:%.*]] = call float @llvm.amdgcn.cos.f32(float [[X]]) ; CHECK-NEXT: ret float [[COS]] ; %x.fabs = call float @llvm.fabs.f32(float %x) @@ -956,8 +1064,9 @@ declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) { -; CHECK-LABEL: @vars_lhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define <2 x half> @vars_lhs_cvt_pkrtz +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X]], float [[Y]]) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -965,8 +1074,9 @@ } define <2 x half> @constant_lhs_cvt_pkrtz(float %y) { -; CHECK-LABEL: @constant_lhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[Y:%.*]]) +; CHECK-LABEL: define <2 x half> @constant_lhs_cvt_pkrtz +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[Y]]) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y) @@ -974,8 +1084,9 @@ } define <2 x half> @constant_rhs_cvt_pkrtz(float %x) { -; CHECK-LABEL: @constant_rhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X:%.*]], float 0.000000e+00) +; CHECK-LABEL: define <2 x half> @constant_rhs_cvt_pkrtz +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X]], float 0.000000e+00) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0) @@ -983,8 +1094,9 @@ } define <2 x half> @undef_lhs_cvt_pkrtz(float %y) { -; CHECK-LABEL: @undef_lhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float [[Y:%.*]]) +; CHECK-LABEL: define <2 x half> @undef_lhs_cvt_pkrtz +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float [[Y]]) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float undef, float %y) @@ -992,8 +1104,9 @@ } define <2 x half> @undef_rhs_cvt_pkrtz(float %x) { -; CHECK-LABEL: @undef_rhs_cvt_pkrtz( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X:%.*]], float undef) +; CHECK-LABEL: define <2 x half> @undef_rhs_cvt_pkrtz +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[X]], float undef) ; CHECK-NEXT: ret <2 x half> [[CVT]] ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef) @@ -1001,7 +1114,8 @@ } define <2 x half> @undef_cvt_pkrtz() { -; CHECK-LABEL: @undef_cvt_pkrtz( +; CHECK-LABEL: define <2 x half> @undef_cvt_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> undef ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) @@ -1009,7 +1123,8 @@ } define <2 x half> @constant_splat0_cvt_pkrtz() { -; CHECK-LABEL: @constant_splat0_cvt_pkrtz( +; CHECK-LABEL: define <2 x half> @constant_splat0_cvt_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> zeroinitializer ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0) @@ -1017,7 +1132,8 @@ } define <2 x half> @constant_cvt_pkrtz() { -; CHECK-LABEL: @constant_cvt_pkrtz( +; CHECK-LABEL: define <2 x half> @constant_cvt_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> <half 0xH4000, half 0xH4400> ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0) @@ -1026,7 +1142,8 @@ ; Test constant values where rtz changes result define <2 x half> @constant_rtz_pkrtz() { -; CHECK-LABEL: @constant_rtz_pkrtz( +; CHECK-LABEL: define <2 x half> @constant_rtz_pkrtz +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x half> <half 0xH7BFF, half 0xH7BFF> ; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0) @@ -1040,8 +1157,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) { -; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float [[Y:%.*]]) +; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pknorm_i16 +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y) @@ -1049,8 +1167,9 @@ } define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) { -; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float [[X:%.*]], float undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pknorm_i16 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float [[X]], float undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef) @@ -1058,7 +1177,8 @@ } define <2 x i16> @undef_cvt_pknorm_i16() { -; CHECK-LABEL: @undef_cvt_pknorm_i16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pknorm_i16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef) @@ -1072,8 +1192,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) { -; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float [[Y:%.*]]) 
+; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pknorm_u16 +; CHECK-SAME: (float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y) @@ -1081,8 +1202,9 @@ } define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) { -; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float [[X:%.*]], float undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pknorm_u16 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float [[X]], float undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef) @@ -1090,7 +1212,8 @@ } define <2 x i16> @undef_cvt_pknorm_u16() { -; CHECK-LABEL: @undef_cvt_pknorm_u16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pknorm_u16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef) @@ -1104,8 +1227,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) { -; CHECK-LABEL: @undef_lhs_cvt_pk_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 [[Y:%.*]]) +; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pk_i16 +; CHECK-SAME: (i32 [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y) @@ -1113,8 +1237,9 @@ } define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) { -; CHECK-LABEL: @undef_rhs_cvt_pk_i16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 [[X:%.*]], i32 undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pk_i16 +; CHECK-SAME: (i32 [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 [[X]], i32 undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef) @@ -1122,7 +1247,8 @@ } define <2 x i16> @undef_cvt_pk_i16() { -; CHECK-LABEL: @undef_cvt_pk_i16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pk_i16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef) @@ -1136,8 +1262,9 @@ declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) { -; CHECK-LABEL: @undef_lhs_cvt_pk_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 [[Y:%.*]]) +; CHECK-LABEL: define <2 x i16> @undef_lhs_cvt_pk_u16 +; CHECK-SAME: (i32 [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 [[Y]]) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y) @@ -1145,8 +1272,9 @@ } define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) { -; CHECK-LABEL: @undef_rhs_cvt_pk_u16( -; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 [[X:%.*]], i32 undef) +; CHECK-LABEL: define <2 x i16> @undef_rhs_cvt_pk_u16 +; CHECK-SAME: (i32 [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CVT:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 [[X]], i32 undef) ; CHECK-NEXT: ret <2 x i16> [[CVT]] ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 
%x, i32 undef) @@ -1154,7 +1282,8 @@ } define <2 x i16> @undef_cvt_pk_u16() { -; CHECK-LABEL: @undef_cvt_pk_u16( +; CHECK-LABEL: define <2 x i16> @undef_cvt_pk_u16 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret <2 x i16> undef ; %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef) @@ -1169,8 +1298,9 @@ declare i64 @llvm.amdgcn.ubfe.i64(i64, i32, i32) nounwind readnone define i32 @ubfe_var_i32(i32 %src, i32 %offset, i32 %width) { -; CHECK-LABEL: @ubfe_var_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_var_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width) @@ -1178,8 +1308,9 @@ } define i32 @ubfe_clear_high_bits_constant_offset_i32(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_clear_high_bits_constant_offset_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 5, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_clear_high_bits_constant_offset_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 5, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 133, i32 %width) @@ -1187,8 +1318,9 @@ } define i32 @ubfe_clear_high_bits_constant_width_i32(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_clear_high_bits_constant_width_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 5) +; CHECK-LABEL: define i32 @ubfe_clear_high_bits_constant_width_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 5) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 133) @@ -1196,7 +1328,8 @@ } define i32 @ubfe_width_0(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_0( +; CHECK-LABEL: define i32 @ubfe_width_0 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 0) @@ -1204,8 +1337,9 @@ } define i32 @ubfe_width_31(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_31( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 31) +; CHECK-LABEL: define i32 @ubfe_width_31 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 31) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31) @@ -1213,7 +1347,8 @@ } define i32 @ubfe_width_32(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_32( +; CHECK-LABEL: define i32 @ubfe_width_32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 32) @@ -1221,8 +1356,9 @@ } define i32 @ubfe_width_33(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_width_33( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 1) +; CHECK-LABEL: define i32 @ubfe_width_33 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 
[[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 1) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 33) @@ -1230,8 +1366,9 @@ } define i32 @ubfe_offset_33(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_33( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 1, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_33 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 1, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 33, i32 %width) @@ -1239,8 +1376,9 @@ } define i32 @ubfe_offset_0(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_0( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 0, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_0 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 0, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width) @@ -1248,8 +1386,9 @@ } define i32 @ubfe_offset_32(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 0, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 0, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width) @@ -1257,8 +1396,9 @@ } define i32 @ubfe_offset_31(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_31( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 31, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_offset_31 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 31, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 31, i32 %width) @@ -1266,7 +1406,8 @@ } define i32 @ubfe_offset_0_width_0(i32 %src) { -; CHECK-LABEL: @ubfe_offset_0_width_0( +; CHECK-LABEL: define i32 @ubfe_offset_0_width_0 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 0) @@ -1274,17 +1415,19 @@ } define i32 @ubfe_offset_0_width_3(i32 %src) { -; CHECK-LABEL: @ubfe_offset_0_width_3( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[SRC:%.*]], 7 -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-LABEL: define i32 @ubfe_offset_0_width_3 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = and i32 [[SRC]], 7 +; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3) ret i32 %bfe } define i32 @ubfe_offset_3_width_1(i32 %src) { -; CHECK-LABEL: @ubfe_offset_3_width_1( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC:%.*]], 3 +; CHECK-LABEL: define i32 @ubfe_offset_3_width_1 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC]], 3 ; CHECK-NEXT: [[BFE:%.*]] = and i32 [[TMP1]], 1 ; CHECK-NEXT: ret i32 [[BFE]] ; @@ -1293,8 +1436,9 @@ } define i32 @ubfe_offset_3_width_4(i32 %src) { -; CHECK-LABEL: @ubfe_offset_3_width_4( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC:%.*]], 3 
+; CHECK-LABEL: define i32 @ubfe_offset_3_width_4 +; CHECK-SAME: (i32 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[SRC]], 3 ; CHECK-NEXT: [[BFE:%.*]] = and i32 [[TMP1]], 15 ; CHECK-NEXT: ret i32 [[BFE]] ; @@ -1303,7 +1447,8 @@ } define i32 @ubfe_0_0_0() { -; CHECK-LABEL: @ubfe_0_0_0( +; CHECK-LABEL: define i32 @ubfe_0_0_0 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) @@ -1311,7 +1456,8 @@ } define i32 @ubfe_neg1_5_7() { -; CHECK-LABEL: @ubfe_neg1_5_7( +; CHECK-LABEL: define i32 @ubfe_neg1_5_7 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 127 ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 -1, i32 5, i32 7) @@ -1319,7 +1465,8 @@ } define i32 @ubfe_undef_src_i32(i32 %offset, i32 %width) { -; CHECK-LABEL: @ubfe_undef_src_i32( +; CHECK-LABEL: define i32 @ubfe_undef_src_i32 +; CHECK-SAME: (i32 [[OFFSET:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret i32 undef ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 undef, i32 %offset, i32 %width) @@ -1327,8 +1474,9 @@ } define i32 @ubfe_undef_offset_i32(i32 %src, i32 %width) { -; CHECK-LABEL: @ubfe_undef_offset_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 undef, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @ubfe_undef_offset_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 undef, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width) @@ -1336,8 +1484,9 @@ } define i32 @ubfe_undef_width_i32(i32 %src, i32 %offset) { -; CHECK-LABEL: @ubfe_undef_width_i32( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC:%.*]], i32 [[OFFSET:%.*]], i32 undef) +; CHECK-LABEL: define i32 @ubfe_undef_width_i32 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.ubfe.i32(i32 [[SRC]], i32 [[OFFSET]], i32 undef) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef) @@ -1345,8 +1494,9 @@ } define i64 @ubfe_offset_33_width_4_i64(i64 %src) { -; CHECK-LABEL: @ubfe_offset_33_width_4_i64( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[SRC:%.*]], 33 +; CHECK-LABEL: define i64 @ubfe_offset_33_width_4_i64 +; CHECK-SAME: (i64 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[SRC]], 33 ; CHECK-NEXT: [[BFE:%.*]] = and i64 [[TMP1]], 15 ; CHECK-NEXT: ret i64 [[BFE]] ; @@ -1355,8 +1505,9 @@ } define i64 @ubfe_offset_0_i64(i64 %src, i32 %width) { -; CHECK-LABEL: @ubfe_offset_0_i64( -; CHECK-NEXT: [[BFE:%.*]] = call i64 @llvm.amdgcn.ubfe.i64(i64 [[SRC:%.*]], i32 0, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i64 @ubfe_offset_0_i64 +; CHECK-SAME: (i64 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i64 @llvm.amdgcn.ubfe.i64(i64 [[SRC]], i32 0, i32 [[WIDTH]]) ; CHECK-NEXT: ret i64 [[BFE]] ; %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width) @@ -1364,8 +1515,9 @@ } define i64 @ubfe_offset_32_width_32_i64(i64 %src) { -; CHECK-LABEL: @ubfe_offset_32_width_32_i64( -; CHECK-NEXT: [[BFE:%.*]] = lshr i64 [[SRC:%.*]], 32 +; CHECK-LABEL: define i64 @ubfe_offset_32_width_32_i64 +; CHECK-SAME: (i64 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = lshr i64 [[SRC]], 32 ; CHECK-NEXT: ret i64 [[BFE]] ; %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 32, i32 32) @@ -1380,8 +1532,9 @@ declare i64 
@llvm.amdgcn.sbfe.i64(i64, i32, i32) nounwind readnone define i32 @sbfe_offset_31(i32 %src, i32 %width) { -; CHECK-LABEL: @sbfe_offset_31( -; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.sbfe.i32(i32 [[SRC:%.*]], i32 31, i32 [[WIDTH:%.*]]) +; CHECK-LABEL: define i32 @sbfe_offset_31 +; CHECK-SAME: (i32 [[SRC:%.*]], i32 [[WIDTH:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = call i32 @llvm.amdgcn.sbfe.i32(i32 [[SRC]], i32 31, i32 [[WIDTH]]) ; CHECK-NEXT: ret i32 [[BFE]] ; %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 31, i32 %width) @@ -1389,7 +1542,8 @@ } define i32 @sbfe_neg1_5_7() { -; CHECK-LABEL: @sbfe_neg1_5_7( +; CHECK-LABEL: define i32 @sbfe_neg1_5_7 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 -1 ; %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 -1, i32 5, i32 7) @@ -1397,8 +1551,9 @@ } define i64 @sbfe_offset_32_width_32_i64(i64 %src) { -; CHECK-LABEL: @sbfe_offset_32_width_32_i64( -; CHECK-NEXT: [[BFE:%.*]] = ashr i64 [[SRC:%.*]], 32 +; CHECK-LABEL: define i64 @sbfe_offset_32_width_32_i64 +; CHECK-SAME: (i64 [[SRC:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[BFE:%.*]] = ashr i64 [[SRC]], 32 ; CHECK-NEXT: ret i64 [[BFE]] ; %bfe = call i64 @llvm.amdgcn.sbfe.i64(i64 %src, i32 32, i32 32) @@ -1416,15 +1571,16 @@ define void @exp_disabled_inputs_to_undef(float %x, float %y, float %z, float %w) { ; enable src0..src3 constants -; CHECK-LABEL: @exp_disabled_inputs_to_undef( +; CHECK-LABEL: define void @exp_disabled_inputs_to_undef +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]], float [[W:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.000000e+00, float undef, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float 2.000000e+00, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float 5.000000e-01, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float 4.000000e+00, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float [[X:%.*]], float undef, float undef, float undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float [[Y:%.*]], float undef, float undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float [[Z:%.*]], float undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float [[W:%.*]], i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float [[X]], float undef, float undef, float undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float [[Y]], float undef, float undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float [[Z]], float undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float [[W]], i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.000000e+00, float undef, float 5.000000e-01, 
float undef, i1 true, i1 false) @@ -1464,16 +1620,17 @@ define void @exp_compr_disabled_inputs_to_undef(<2 x half> %xy, <2 x half> %zw) { -; CHECK-LABEL: @exp_compr_disabled_inputs_to_undef( +; CHECK-LABEL: define void @exp_compr_disabled_inputs_to_undef +; CHECK-SAME: (<2 x half> [[XY:%.*]], <2 x half> [[ZW:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> , <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> , <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> , <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> [[XY:%.*]], <2 x half> undef, i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> [[XY]], <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> [[XY]], <2 x half> undef, i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> [[XY]], <2 x half> undef, i1 true, i1 false) -; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> [[ZW:%.*]], i1 true, i1 false) +; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> [[ZW]], i1 true, i1 false) ; CHECK-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> [[XY]], <2 x half> [[ZW]], i1 true, i1 false) ; CHECK-NEXT: ret void ; @@ -1499,8 +1656,9 @@ declare float @llvm.amdgcn.fmed3.f32(float, float, float) nounwind readnone define float @fmed3_f32(float %x, float %y, float %z) { -; CHECK-LABEL: @fmed3_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +; CHECK-LABEL: define float @fmed3_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float [[Z]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) @@ -1508,8 +1666,9 @@ } define float @fmed3_canonicalize_x_c0_c1_f32(float %x) { -; CHECK-LABEL: @fmed3_canonicalize_x_c0_c1_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float 0.000000e+00, float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_x_c0_c1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float 0.000000e+00, float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0) @@ -1517,8 +1676,9 @@ } define float @fmed3_canonicalize_c0_x_c1_f32(float %x) { -; CHECK-LABEL: @fmed3_canonicalize_c0_x_c1_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float 0.000000e+00, float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_c0_x_c1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float 0.000000e+00, float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 
= call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0) @@ -1526,8 +1686,9 @@ } define float @fmed3_canonicalize_c0_c1_x_f32(float %x) { -; CHECK-LABEL: @fmed3_canonicalize_c0_c1_x_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float 0.000000e+00, float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_c0_c1_x_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float 0.000000e+00, float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x) @@ -1535,8 +1696,9 @@ } define float @fmed3_canonicalize_x_y_c_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_canonicalize_x_y_c_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_x_y_c_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.0) @@ -1544,8 +1706,9 @@ } define float @fmed3_canonicalize_x_c_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_canonicalize_x_c_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_x_c_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float %y) @@ -1553,8 +1716,9 @@ } define float @fmed3_canonicalize_c_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_canonicalize_c_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X:%.*]], float [[Y:%.*]], float 1.000000e+00) +; CHECK-LABEL: define float @fmed3_canonicalize_c_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[X]], float [[Y]], float 1.000000e+00) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float %y) @@ -1562,8 +1726,9 @@ } define float @fmed3_undef_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_undef_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_undef_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y) @@ -1571,8 +1736,9 @@ } define float @fmed3_fmf_undef_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_fmf_undef_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call nnan float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_fmf_undef_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call nnan float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call nnan float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y) @@ -1580,8 +1746,9 @@ } define float 
@fmed3_x_undef_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_undef_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_undef_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float undef, float %y) @@ -1589,8 +1756,9 @@ } define float @fmed3_x_y_undef_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_y_undef_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_y_undef_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float undef) @@ -1598,8 +1766,9 @@ } define float @fmed3_qnan0_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_qnan0_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_qnan0_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000000000000, float %x, float %y) @@ -1607,8 +1776,9 @@ } define float @fmed3_x_qnan0_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_qnan0_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_qnan0_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8000000000000, float %y) @@ -1616,8 +1786,9 @@ } define float @fmed3_x_y_qnan0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_x_y_qnan0_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_x_y_qnan0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 0x7FF8000000000000) @@ -1625,8 +1796,9 @@ } define float @fmed3_qnan1_x_y_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_qnan1_x_y_f32( -; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-LABEL: define float @fmed3_qnan1_x_y_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MED3:%.*]] = call float @llvm.minnum.f32(float [[X]], float [[Y]]) ; CHECK-NEXT: ret float [[MED3]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float %x, float %y) @@ -1635,7 +1807,8 @@ ; This can return any of the qnans. 
define float @fmed3_qnan0_qnan1_qnan2_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_qnan0_qnan1_qnan2_f32( +; CHECK-LABEL: define float @fmed3_qnan0_qnan1_qnan2_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 0x7FF8030000000000 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float 0x7FF8002000000000, float 0x7FF8030000000000) @@ -1643,7 +1816,8 @@ } define float @fmed3_constant_src0_0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src0_0_f32( +; CHECK-LABEL: define float @fmed3_constant_src0_0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float -1.0, float 4.0) @@ -1651,7 +1825,8 @@ } define float @fmed3_constant_src0_1_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src0_1_f32( +; CHECK-LABEL: define float @fmed3_constant_src0_1_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float 4.0, float -1.0) @@ -1659,7 +1834,8 @@ } define float @fmed3_constant_src1_0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src1_0_f32( +; CHECK-LABEL: define float @fmed3_constant_src1_0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 0.5, float 4.0) @@ -1667,7 +1843,8 @@ } define float @fmed3_constant_src1_1_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src1_1_f32( +; CHECK-LABEL: define float @fmed3_constant_src1_1_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float 0.5, float -1.0) @@ -1675,7 +1852,8 @@ } define float @fmed3_constant_src2_0_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src2_0_f32( +; CHECK-LABEL: define float @fmed3_constant_src2_0_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 4.0, float 0.5) @@ -1683,7 +1861,8 @@ } define float @fmed3_constant_src2_1_f32(float %x, float %y) { -; CHECK-LABEL: @fmed3_constant_src2_1_f32( +; CHECK-LABEL: define float @fmed3_constant_src2_1_f32 +; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: ret float 5.000000e-01 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float -1.0, float 0.5) @@ -1691,31 +1870,35 @@ } define float @fmed3_x_qnan0_qnan1_f32(float %x) { -; CHECK-LABEL: @fmed3_x_qnan0_qnan1_f32( -; CHECK-NEXT: ret float [[X:%.*]] +; CHECK-LABEL: define float @fmed3_x_qnan0_qnan1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: ret float [[X]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8001000000000, float 0x7FF8002000000000) ret float %med3 } define float @fmed3_qnan0_x_qnan1_f32(float %x) { -; CHECK-LABEL: @fmed3_qnan0_x_qnan1_f32( -; CHECK-NEXT: ret float [[X:%.*]] +; CHECK-LABEL: define float @fmed3_qnan0_x_qnan1_f32 +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: ret float [[X]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float %x, float 0x7FF8002000000000) ret float %med3 } define float @fmed3_qnan0_qnan1_x_f32(float %x) { -; CHECK-LABEL: @fmed3_qnan0_qnan1_x_f32( -; CHECK-NEXT: ret float [[X:%.*]] +; CHECK-LABEL: define float @fmed3_qnan0_qnan1_x_f32 +; 
CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: ret float [[X]] ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0x7FF8002000000000, float %x) ret float %med3 } define float @fmed3_nan_0_1_f32() { -; CHECK-LABEL: @fmed3_nan_0_1_f32( +; CHECK-LABEL: define float @fmed3_nan_0_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0.0, float 1.0) @@ -1723,7 +1906,8 @@ } define float @fmed3_0_nan_1_f32() { -; CHECK-LABEL: @fmed3_0_nan_1_f32( +; CHECK-LABEL: define float @fmed3_0_nan_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 0x7FF8001000000000, float 1.0) @@ -1731,7 +1915,8 @@ } define float @fmed3_0_1_nan_f32() { -; CHECK-LABEL: @fmed3_0_1_nan_f32( +; CHECK-LABEL: define float @fmed3_0_1_nan_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 1.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8001000000000) @@ -1739,7 +1924,8 @@ } define float @fmed3_undef_0_1_f32() { -; CHECK-LABEL: @fmed3_undef_0_1_f32( +; CHECK-LABEL: define float @fmed3_undef_0_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float 0.0, float 1.0) @@ -1747,7 +1933,8 @@ } define float @fmed3_0_undef_1_f32() { -; CHECK-LABEL: @fmed3_0_undef_1_f32( +; CHECK-LABEL: define float @fmed3_0_undef_1_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float undef, float 1.0) @@ -1755,7 +1942,8 @@ } define float @fmed3_0_1_undef_f32() { -; CHECK-LABEL: @fmed3_0_1_undef_f32( +; CHECK-LABEL: define float @fmed3_0_1_undef_f32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret float 1.000000e+00 ; %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float undef) @@ -1771,8 +1959,9 @@ declare i64 @llvm.amdgcn.icmp.i64.i1(i1, i1, i32 immarg) nounwind readnone convergent define i64 @invalid_icmp_code(i32 %a, i32 %b) { -; CHECK-LABEL: @invalid_icmp_code( -; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 31) +; CHECK-LABEL: define i64 @invalid_icmp_code +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 31) ; CHECK-NEXT: [[OVER:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 42) ; CHECK-NEXT: [[OR:%.*]] = or i64 [[UNDER]], [[OVER]] ; CHECK-NEXT: ret i64 [[OR]] @@ -1784,7 +1973,8 @@ } define i64 @icmp_constant_inputs_false() { -; CHECK-LABEL: @icmp_constant_inputs_false( +; CHECK-LABEL: define i64 @icmp_constant_inputs_false +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i64 0 ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 32) @@ -1792,8 +1982,9 @@ } define i64 @icmp_constant_inputs_true() { -; CHECK-LABEL: @icmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR17:[0-9]+]] +; CHECK-LABEL: define i64 @icmp_constant_inputs_true +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR16:[0-9]+]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34) @@ -1801,8 +1992,9 @@ } define i64 @icmp_constant_to_rhs_slt(i32 %x) { -; CHECK-LABEL: 
@icmp_constant_to_rhs_slt( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[X:%.*]], i32 9, i32 38) +; CHECK-LABEL: define i64 @icmp_constant_to_rhs_slt +; CHECK-SAME: (i32 [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[X]], i32 9, i32 38) ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 %x, i32 40) @@ -1810,8 +2002,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1821,8 +2014,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ne_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ne_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 33) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ne_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ne i32 %a, %b @@ -1832,8 +2026,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_sle_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 41) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_sle_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 41) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp sle i32 %a, %b @@ -1843,8 +2038,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ugt_i64( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A:%.*]], i64 [[B:%.*]], i32 34) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A]], i64 [[B]], i32 34) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ugt i64 %a, %b @@ -1854,8 +2050,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_swap_i64( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A:%.*]], i64 [[B:%.*]], i32 34) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[A]], i64 [[B]], i32 34) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ugt i64 %a, %b @@ -1865,8 +2062,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 1) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 1) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp oeq float %a, %b @@ -1876,8 +2074,9 
@@ } define i64 @fold_icmp_ne_0_zext_fcmp_une_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_une_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 14) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_une_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 14) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp une float %a, %b @@ -1887,8 +2086,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64(double %a, double %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_olt_f64( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f64(double [[A:%.*]], double [[B:%.*]], i32 4) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64 +; CHECK-SAME: (double [[A:%.*]], double [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f64(double [[A]], double [[B]], i32 4) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp olt double %a, %b @@ -1898,8 +2098,9 @@ } define i64 @fold_icmp_sext_icmp_ne_0_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_ne_0_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_ne_0_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1909,8 +2110,9 @@ } define i64 @fold_icmp_eq_0_zext_icmp_eq_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_eq_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 33) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_icmp_eq_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1920,8 +2122,9 @@ } define i64 @fold_icmp_eq_0_zext_icmp_slt_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_slt_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 39) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_icmp_slt_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 39) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp slt i32 %a, %b @@ -1931,8 +2134,9 @@ } define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_oeq_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 14) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 14) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp oeq float %a, %b @@ -1942,8 +2146,9 @@ } define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ule_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 2) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 
@llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 2) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp ule float %a, %b @@ -1953,8 +2158,9 @@ } define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ogt_f32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 13) +; CHECK-LABEL: define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 13) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp ogt float %a, %b @@ -1964,8 +2170,9 @@ } define i64 @fold_icmp_zext_icmp_eq_1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_zext_icmp_eq_1_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_zext_icmp_eq_1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -1975,8 +2182,9 @@ } define i64 @fold_icmp_zext_argi1_eq_1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_zext_argi1_eq_1_i32( -; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_zext_argi1_eq_1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_COND]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -1986,8 +2194,9 @@ } define i64 @fold_icmp_zext_argi1_eq_neg1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_zext_argi1_eq_neg1_i32( -; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_zext_argi1_eq_neg1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[ZEXT_COND:%.*]] = zext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_COND]], i32 -1, i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -1997,8 +2206,9 @@ } define i64 @fold_icmp_sext_argi1_eq_1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_sext_argi1_eq_1_i32( -; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_sext_argi1_eq_1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[SEXT_COND]], i32 1, i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2008,8 +2218,9 @@ } define i64 @fold_icmp_sext_argi1_eq_neg1_i32(i1 %cond) { -; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i32( -; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND:%.*]] to i32 +; CHECK-LABEL: define i64 @fold_icmp_sext_argi1_eq_neg1_i32 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[SEXT_COND]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2019,8 +2230,9 @@ } define i64 @fold_icmp_sext_argi1_eq_neg1_i64(i1 %cond) { -; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i64( -; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND:%.*]] to i64 +; CHECK-LABEL: define i64 @fold_icmp_sext_argi1_eq_neg1_i64 +; CHECK-SAME: (i1 [[COND:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SEXT_COND:%.*]] = sext i1 [[COND]] to i64 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[SEXT_COND]], 
i64 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2031,8 +2243,9 @@ ; TODO: Should be able to fold to false define i64 @fold_icmp_sext_icmp_eq_1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_eq_1_i32( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_eq_1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], [[B]] ; CHECK-NEXT: [[SEXT_CMP:%.*]] = sext i1 [[CMP]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[SEXT_CMP]], i32 1, i32 32) ; CHECK-NEXT: ret i64 [[MASK]] @@ -2044,8 +2257,9 @@ } define i64 @fold_icmp_sext_icmp_eq_neg1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_eq_neg1_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_eq_neg1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i32 %a, %b @@ -2055,8 +2269,9 @@ } define i64 @fold_icmp_sext_icmp_sge_neg1_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_sext_icmp_sge_neg1_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 39) +; CHECK-LABEL: define i64 @fold_icmp_sext_icmp_sge_neg1_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 39) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp sge i32 %a, %b @@ -2066,8 +2281,9 @@ } define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_not_icmp_ne_0_zext_icmp_sle_i32( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 38) +; CHECK-LABEL: define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[A]], i32 [[B]], i32 38) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp sle i32 %a, %b @@ -2078,9 +2294,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i4( -; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2091,9 +2308,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i8( -; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2104,8 +2322,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 
@llvm.amdgcn.icmp.i64.i16(i16 [[A:%.*]], i16 [[B:%.*]], i32 32) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A]], i16 [[B]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp eq i16 %a, %b @@ -2115,9 +2334,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i36(i36 %a, i36 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i36( -; CHECK-NEXT: [[TMP1:%.*]] = zext i36 [[A:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = zext i36 [[B:%.*]] to i64 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i36 +; CHECK-SAME: (i36 [[A:%.*]], i36 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i36 [[A]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i36 [[B]] to i64 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 32) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2128,8 +2348,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_eq_i128(i128 %a, i128 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i128( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_eq_i128 +; CHECK-SAME: (i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A]], [[B]] ; CHECK-NEXT: [[ZEXT_CMP:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_CMP]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] @@ -2141,8 +2362,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f16(half %a, half %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f16(half [[A:%.*]], half [[B:%.*]], i32 1) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f16 +; CHECK-SAME: (half [[A:%.*]], half [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f16(half [[A]], half [[B]], i32 1) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = fcmp oeq half %a, %b @@ -2152,8 +2374,9 @@ } define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f128(fp128 %a, fp128 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f128( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f128 +; CHECK-SAME: (fp128 [[A:%.*]], fp128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A]], [[B]] ; CHECK-NEXT: [[ZEXT_CMP:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i32(i32 [[ZEXT_CMP]], i32 0, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] @@ -2165,9 +2388,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_slt_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i4( -; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = sext i4 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_slt_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = sext i4 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 40) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2178,9 +2402,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_slt_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i8( -; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_slt_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) 
#[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 40) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2191,8 +2416,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_slt_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A:%.*]], i16 [[B:%.*]], i32 40) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_slt_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A]], i16 [[B]], i32 40) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp slt i16 %a, %b @@ -2202,9 +2428,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i4( -; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i4 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 36) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2215,9 +2442,10 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i8( -; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16 +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[B]] to i16 ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[TMP1]], i16 [[TMP2]], i32 36) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2228,8 +2456,9 @@ } define i64 @fold_icmp_ne_0_zext_icmp_ult_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i16( -; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A:%.*]], i16 [[B:%.*]], i32 36) +; CHECK-LABEL: define i64 @fold_icmp_ne_0_zext_icmp_ult_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i16(i16 [[A]], i16 [[B]], i32 36) ; CHECK-NEXT: ret i64 [[MASK]] ; %cmp = icmp ult i16 %a, %b @@ -2241,8 +2470,9 @@ ; 1-bit NE comparisons define i64 @fold_icmp_i1_ne_0_icmp_eq_i1(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i1( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i1 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2252,8 +2482,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ne_i1(i32 %a, i32 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ne_i1( -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ne_i1 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2263,8 +2494,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_sle_i1(i32 %a, i32 %b) { 
-; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_sle_i1( -; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_sle_i1 +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2274,8 +2506,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ugt_i64( -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2285,8 +2518,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64(i64 %a, i64 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_swap_i64( -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64 +; CHECK-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2296,8 +2530,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f32( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2307,8 +2542,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_une_f32(float %a, float %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_une_f32( -; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_une_f32 +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2318,8 +2554,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64(double %a, double %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_olt_f64( -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64 +; CHECK-SAME: (double [[A:%.*]], double [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2329,8 +2566,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i4( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i4 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i4 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2340,8 +2578,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i8(i8 %a, i8 %b) { -; CHECK-LABEL: 
@fold_icmp_i1_ne_0_icmp_eq_i8( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2351,8 +2590,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i16( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2362,8 +2602,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i36(i36 %a, i36 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i36( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i36 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i36 +; CHECK-SAME: (i36 [[A:%.*]], i36 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i36 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2373,8 +2614,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_eq_i128(i128 %a, i128 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i128( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_eq_i128 +; CHECK-SAME: (i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2384,8 +2626,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16(half %a, half %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f16( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq half [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16 +; CHECK-SAME: (half [[A:%.*]], half [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq half [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2395,8 +2638,9 @@ } define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128(fp128 %a, fp128 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f128( -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128 +; CHECK-SAME: (fp128 [[A:%.*]], fp128 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq fp128 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2406,8 +2650,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_slt_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i4( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i4 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_slt_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i4 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2417,8 +2662,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_slt_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i8( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A:%.*]], 
[[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_slt_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2428,8 +2674,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_slt_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i16( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_slt_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2439,8 +2686,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_i4(i4 %a, i4 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i4( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i4 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_i4 +; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i4 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2450,8 +2698,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_i8(i8 %a, i8 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i8( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_i8 +; CHECK-SAME: (i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2461,8 +2710,9 @@ } define i64 @fold_icmp_i1_ne_0_icmp_ult_i16(i16 %a, i16 %b) { -; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i16( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[A:%.*]], [[B:%.*]] +; CHECK-LABEL: define i64 @fold_icmp_i1_ne_0_icmp_ult_i16 +; CHECK-SAME: (i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[A]], [[B]] ; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i1(i1 [[CMP]], i1 false, i32 33) ; CHECK-NEXT: ret i64 [[MASK]] ; @@ -2478,8 +2728,9 @@ declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32 immarg) nounwind readnone convergent define i64 @invalid_fcmp_code(float %a, float %b) { -; CHECK-LABEL: @invalid_fcmp_code( -; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A:%.*]], float [[B:%.*]], i32 -1) +; CHECK-LABEL: define i64 @invalid_fcmp_code +; CHECK-SAME: (float [[A:%.*]], float [[B:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[UNDER:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 -1) ; CHECK-NEXT: [[OVER:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[A]], float [[B]], i32 16) ; CHECK-NEXT: [[OR:%.*]] = or i64 [[UNDER]], [[OVER]] ; CHECK-NEXT: ret i64 [[OR]] @@ -2491,7 +2742,8 @@ } define i64 @fcmp_constant_inputs_false() { -; CHECK-LABEL: @fcmp_constant_inputs_false( +; CHECK-LABEL: define i64 @fcmp_constant_inputs_false +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i64 0 ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 1) @@ -2499,8 +2751,9 @@ } define i64 @fcmp_constant_inputs_true() { -; CHECK-LABEL: @fcmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]] +; CHECK-LABEL: define i64 @fcmp_constant_inputs_true +; CHECK-SAME: () 
#[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4) @@ -2508,8 +2761,9 @@ } define i64 @fcmp_constant_to_rhs_olt(float %x) { -; CHECK-LABEL: @fcmp_constant_to_rhs_olt( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[X:%.*]], float 4.000000e+00, i32 2) +; CHECK-LABEL: define i64 @fcmp_constant_to_rhs_olt +; CHECK-SAME: (float [[X:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.amdgcn.fcmp.i64.f32(float [[X]], float 4.000000e+00, i32 2) ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 4.0, float %x, i32 4) @@ -2524,8 +2778,9 @@ declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent define i64 @ballot_nocombine_64(i1 %i) { -; CHECK-LABEL: @ballot_nocombine_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]]) +; CHECK-LABEL: define i64 @ballot_nocombine_64 +; CHECK-SAME: (i1 [[I:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I]]) ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i) @@ -2533,7 +2788,8 @@ } define i64 @ballot_zero_64() { -; CHECK-LABEL: @ballot_zero_64( +; CHECK-LABEL: define i64 @ballot_zero_64 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i64 0 ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 0) @@ -2541,8 +2797,9 @@ } define i64 @ballot_one_64() { -; CHECK-LABEL: @ballot_one_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]] +; CHECK-LABEL: define i64 @ballot_one_64 +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]] ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 1) @@ -2550,8 +2807,9 @@ } define i32 @ballot_nocombine_32(i1 %i) { -; CHECK-LABEL: @ballot_nocombine_32( -; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]]) +; CHECK-LABEL: define i32 @ballot_nocombine_32 +; CHECK-SAME: (i1 [[I:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I]]) ; CHECK-NEXT: ret i32 [[B]] ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 %i) @@ -2559,7 +2817,8 @@ } define i32 @ballot_zero_32() { -; CHECK-LABEL: @ballot_zero_32( +; CHECK-LABEL: define i32 @ballot_zero_32 +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: ret i32 0 ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 0) @@ -2567,8 +2826,9 @@ } define i32 @ballot_one_32() { -; CHECK-LABEL: @ballot_one_32( -; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR17]] +; CHECK-LABEL: define i32 @ballot_one_32 +; CHECK-SAME: () #[[ATTR3]] { +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR16]] ; CHECK-NEXT: ret i32 [[B]] ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 1) @@ -2582,7 +2842,8 @@ declare i1 @llvm.amdgcn.wqm.vote(i1) define float @wqm_vote_true() { -; CHECK-LABEL: @wqm_vote_true( +; CHECK-LABEL: define float @wqm_vote_true +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: main_body: ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -2593,7 +2854,8 @@ } define float @wqm_vote_false() { -; CHECK-LABEL: @wqm_vote_false( +; CHECK-LABEL: define float @wqm_vote_false +; CHECK-SAME: () #[[ATTR3]] { ; CHECK-NEXT: main_body: ; CHECK-NEXT: ret float 0.000000e+00 ; @@ -2604,7 +2866,8 @@ } define float @wqm_vote_undef() { -; CHECK-LABEL: @wqm_vote_undef( +; 
CHECK-LABEL: @wqm_vote_undef(
+; CHECK-LABEL: define float @wqm_vote_undef
+; CHECK-SAME: () #[[ATTR3]] {
; CHECK-NEXT: main_body:
; CHECK-NEXT: ret float 0.000000e+00
;
@@ -2621,7 +2884,8 @@
declare void @llvm.amdgcn.kill(i1)

define void @kill_true() {
-; CHECK-LABEL: @kill_true(
+; CHECK-LABEL: define void @kill_true
+; CHECK-SAME: () #[[ATTR3]] {
; CHECK-NEXT: ret void
;
call void @llvm.amdgcn.kill(i1 true)
@@ -2637,8 +2901,9 @@
@gv = constant i32 0

define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2660,8 +2925,9 @@
}

define i32 @readfirstlane_idempotent(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-LABEL: define i32 @readfirstlane_idempotent
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2671,8 +2937,9 @@
}

define i32 @readfirstlane_readlane(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-LABEL: define i32 @readfirstlane_readlane
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2681,12 +2948,13 @@
}

define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
+; CHECK-LABEL: define i32 @readfirstlane_readfirstlane_different_block
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]])
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2699,12 +2967,13 @@
}

define i32 @readfirstlane_readlane_different_block(i32 %arg) {
-; CHECK-LABEL: @readfirstlane_readlane_different_block(
+; CHECK-LABEL: define i32 @readfirstlane_readlane_different_block
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
; CHECK-NEXT: ret i32 [[READ1]]
;
bb0:
@@ -2716,6 +2985,44 @@
ret i32 %read1
}

+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: define i32 @readfirstlane_bitcast
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[READ:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[READ]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: define float @bitcast_readfirstlane_bitcast
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ %cast.read = bitcast i32 %read to float
+ ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: define i32 @readfirstlane_bitcast_multi_use
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: store float [[ARG]], ptr undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[READ:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[READ]]
+;
+ %bitcast.arg = bitcast float %arg to i32
+ store i32 %bitcast.arg, i32* undef
+ %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+ ret i32 %read
+}
+
; --------------------------------------------------------------------
; llvm.amdgcn.readlane
; --------------------------------------------------------------------
@@ -2723,8 +3030,9 @@
declare i32 @llvm.amdgcn.readlane(i32, i32)

define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
-; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
+; CHECK-LABEL: define amdgpu_kernel void @readlane_constant
+; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 7)
; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4
; CHECK-NEXT: store volatile i32 0, ptr undef, align 4
; CHECK-NEXT: store volatile i32 123, ptr undef, align 4
@@ -2746,8 +3054,9 @@
}

define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
-; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-LABEL: define i32 @readlane_idempotent
+; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]])
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2756,9 +3065,10 @@
}

define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
-; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
+; CHECK-LABEL: define i32 @readlane_idempotent_different_lanes
+; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE0:%.*]], i32 [[LANE1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1]])
; CHECK-NEXT: ret i32 [[READ1]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
@@ -2767,8 +3077,9 @@
}

define i32 @readlane_readfirstlane(i32 %arg) {
-; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT:
[[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-LABEL: define i32 @readlane_readfirstlane +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) ; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) @@ -2777,12 +3088,13 @@ } define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) { -; CHECK-LABEL: @readlane_idempotent_different_block( +; CHECK-LABEL: define i32 @readlane_idempotent_different_block +; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]]) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE]]) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2796,12 +3108,13 @@ define i32 @readlane_readfirstlane_different_block(i32 %arg) { -; CHECK-LABEL: @readlane_readfirstlane_different_block( +; CHECK-LABEL: define i32 @readlane_readfirstlane_different_block +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 0) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2820,35 +3133,38 @@ declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) define amdgpu_kernel void @update_dpp_no_combine(ptr addrspace(1) %out, i32 %in1, i32 %in2) { -; CHECK-LABEL: @update_dpp_no_combine( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 [[IN1:%.*]], i32 [[IN2:%.*]], i32 1, i32 1, i32 1, i1 false) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @update_dpp_no_combine +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 [[IN1]], i32 [[IN2]], i32 1, i32 1, i32 1, i1 false) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) - store i32 %tmp0, ptr addrspace(1) %out + %val0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) + store i32 %val0, ptr addrspace(1) %out ret void } define amdgpu_kernel void @update_dpp_drop_old(ptr addrspace(1) %out, i32 %in1, i32 %in2) { -; CHECK-LABEL: @update_dpp_drop_old( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN2:%.*]], i32 3, i32 15, i32 15, i1 true) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @update_dpp_drop_old +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN1:%.*]], i32 [[IN2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN2]], i32 3, i32 15, i32 15, i1 true) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(1) 
[[OUT]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1) - store i32 %tmp0, ptr addrspace(1) %out + %val0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1) + store i32 %val0, ptr addrspace(1) %out ret void } define amdgpu_kernel void @update_dpp_undef_old(ptr addrspace(1) %out, i32 %in1) { -; CHECK-LABEL: @update_dpp_undef_old( -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN1:%.*]], i32 4, i32 15, i32 15, i1 true) -; CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @update_dpp_undef_old +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[IN1:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[IN1]], i32 4, i32 15, i32 15, i1 true) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; - %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1) - store i32 %tmp0, ptr addrspace(1) %out + %val0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1) + store i32 %val0, ptr addrspace(1) %out ret void } @@ -2860,9 +3176,10 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1 immarg, i1 immarg) define amdgpu_kernel void @permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlane16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlane16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 false) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) @@ -2871,9 +3188,10 @@ } define amdgpu_kernel void @permlane16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlane16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlane16_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) @@ -2882,9 +3200,10 @@ } define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], 
align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) @@ -2899,9 +3218,10 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1 immarg, i1 immarg) define amdgpu_kernel void @permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlanex16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlanex16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 false) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) @@ -2910,9 +3230,10 @@ } define amdgpu_kernel void @permlanex16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlanex16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlanex16_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 false, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) @@ -2921,9 +3242,10 @@ } define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) -; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0]], i32 [[SRC1]], i32 [[SRC2]], i1 true, i1 true) +; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) @@ -2991,9 +3313,10 @@ declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 define 
amdgpu_kernel void @image_sample_a16_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3003,9 +3326,10 @@ } define amdgpu_kernel void @image_sample_a16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3016,9 +3340,10 @@ } define amdgpu_kernel void @image_sample_a16_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { -; CHECK-LABEL: @image_sample_a16_3d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3031,9 +3356,10 @@ define amdgpu_kernel void @image_sample_a16_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { ; -; CHECK-LABEL: @image_sample_a16_cube( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void 
@image_sample_a16_cube +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S]], half [[T]], half [[FACE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3045,9 +3371,10 @@ } define amdgpu_kernel void @image_sample_a16_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_1darray( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3058,9 +3385,10 @@ } define amdgpu_kernel void @image_sample_a16_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_2darray( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3072,9 +3400,10 @@ } define amdgpu_kernel void @image_sample_a16_c_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> 
[[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3084,9 +3413,10 @@ } define amdgpu_kernel void @image_sample_a16_c_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3097,9 +3427,10 @@ } define amdgpu_kernel void @image_sample_a16_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3110,9 +3441,10 @@ } define amdgpu_kernel void @image_sample_a16_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3124,9 +3456,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, 
half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3137,9 +3470,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3151,9 +3485,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s) { -; CHECK-LABEL: @image_sample_a16_b16_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3164,10 +3499,11 @@ } define amdgpu_kernel void @image_sample_a16_b32_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) { -; CHECK-LABEL: @image_sample_a16_b32_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3177,9 +3513,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_b16_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3191,11 +3528,12 @@ } define amdgpu_kernel void @image_sample_a16_b32_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_b32_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], float [[T32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3206,9 +3544,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_b16_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3219,10 +3558,11 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_b32_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3232,9 +3572,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_b16_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3246,11 +3587,12 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { -; CHECK-LABEL: 
@image_sample_a16_c_b32_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], float [[T32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3261,9 +3603,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b16_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3275,11 +3618,12 @@ } define amdgpu_kernel void @image_sample_a16_b32_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b32_cl_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> 
[[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3290,9 +3634,10 @@ } define amdgpu_kernel void @image_sample_a16_b16_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b16_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b16_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3305,12 +3650,13 @@ } define amdgpu_kernel void @image_sample_a16_b32_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_b32_cl_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_b32_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3322,9 +3668,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_b16_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_cl_1d 
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3336,11 +3683,12 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_b32_cl_1d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3351,9 +3699,10 @@ } define amdgpu_kernel void @image_sample_a16_c_b16_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_b16_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b16_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS]], float [[ZCOMPARE]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %bias32 = fpext half %bias to float @@ -3366,12 +3715,13 @@ } define amdgpu_kernel void @image_sample_a16_c_b32_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { -; CHECK-LABEL: 
@image_sample_a16_c_b32_cl_2d( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float -; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_b32_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T]] to float +; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3383,9 +3733,10 @@ } define amdgpu_kernel void @image_sample_a16_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3397,9 +3748,10 @@ } define amdgpu_kernel void @image_sample_a16_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half 
[[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3414,9 +3766,10 @@ } define amdgpu_kernel void @image_sample_a16_d_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) { -; CHECK-LABEL: @image_sample_a16_d_3d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DRDH]], half [[DSDV]], half [[DTDV]], half [[DRDV]], half [[S]], half [[T]], half [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3434,9 +3787,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_c_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3448,9 +3802,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr 
addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3465,9 +3820,10 @@ } define amdgpu_kernel void @image_sample_a16_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3480,9 +3836,10 @@ } define amdgpu_kernel void @image_sample_a16_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3498,9 +3855,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: 
@image_sample_a16_c_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3513,9 +3871,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3531,9 +3890,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) 
[[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3545,9 +3905,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3562,9 +3923,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { -; CHECK-LABEL: @image_sample_a16_c_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3576,9 +3938,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] 
{ +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3593,9 +3956,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3608,9 +3972,10 @@ } define amdgpu_kernel void @image_sample_a16_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3626,9 +3991,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> 
[[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], half [[S]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3641,9 +4007,10 @@ } define amdgpu_kernel void @image_sample_a16_c_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; CHECK-LABEL: @image_sample_a16_c_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3659,9 +4026,10 @@ } define amdgpu_kernel void @image_sample_a16_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { -; CHECK-LABEL: @image_sample_a16_l_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half [[S]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3672,9 +4040,10 @@ } define amdgpu_kernel void @image_sample_a16_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { -; CHECK-LABEL: @image_sample_a16_l_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half [[S:%.*]], half 
[[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3686,9 +4055,10 @@ } define amdgpu_kernel void @image_sample_a16_c_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { -; CHECK-LABEL: @image_sample_a16_c_l_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3699,9 +4069,10 @@ } define amdgpu_kernel void @image_sample_a16_c_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { -; CHECK-LABEL: @image_sample_a16_c_l_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], half [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3713,9 +4084,10 @@ } define amdgpu_kernel void @image_sample_a16_lz_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_lz_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void 
@image_sample_a16_lz_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3725,9 +4097,10 @@ } define amdgpu_kernel void @image_sample_a16_lz_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_lz_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_lz_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3738,9 +4111,10 @@ } define amdgpu_kernel void @image_sample_a16_c_lz_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { -; CHECK-LABEL: @image_sample_a16_c_lz_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_lz_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3750,9 +4124,10 @@ } define amdgpu_kernel void @image_sample_a16_c_lz_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_c_lz_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_lz_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x 
float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -3763,9 +4138,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V1(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V1( -; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V1 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3781,9 +4157,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V2( -; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3799,9 +4176,10 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const( -; CHECK-NEXT: [[RES:%.*]] = call <2 x 
float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half 0xH3400, half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], half [[S]], half 0xH3400, half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3816,11 +4194,12 @@ } define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const_noopt( -; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float -; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float -; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S]] to float +; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE]] to float +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3835,9 +4214,10 @@ } define amdgpu_kernel void @image_load_a16_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 
[[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3847,10 +4227,11 @@ } define amdgpu_kernel void @image_load_a16_mip_1d_noopt(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_1d_noopt( -; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S:%.*]] to i32 -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_1d_noopt +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S]] to i32 +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = sext i16 %s to i32 @@ -3860,9 +4241,10 @@ } define amdgpu_kernel void @image_load_a16_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s, i16 %t) { -; CHECK-LABEL: @image_load_a16_mip_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]], i16 [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S]], i16 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3873,9 +4255,10 @@ } define amdgpu_kernel void @image_load_a16_mip_2d_const(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_2d_const( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 -1, <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_2d_const +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S]], i16 -1, <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3885,10 +4268,11 @@ } define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i16 %s) { -; CHECK-LABEL: @image_load_a16_mip_2d_const_noopt( -; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S:%.*]] to i32 -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt +; 
CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i16 [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S]] to i32 +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = zext i16 %s to i32 @@ -3902,9 +4286,10 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @image_sample_g16_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3915,9 +4300,10 @@ } define amdgpu_kernel void @image_sample_g16_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3930,9 +4316,10 @@ } define amdgpu_kernel void @image_sample_g16_d_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { -; CHECK-LABEL: @image_sample_g16_d_3d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x 
float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DRDH]], half [[DSDV]], half [[DTDV]], half [[DRDV]], float [[S]], float [[T]], float [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3947,9 +4334,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_c_d_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3960,9 +4348,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_c_d_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3975,9 +4364,10 @@ } define amdgpu_kernel void @image_sample_g16_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 
x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -3988,9 +4378,10 @@ } define amdgpu_kernel void @image_sample_g16_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4003,9 +4394,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_d_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> 
[[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4016,9 +4408,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_d_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4031,9 +4424,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4044,9 +4438,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> 
inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4059,9 +4454,10 @@ } define amdgpu_kernel void @image_sample_g16_c_cd_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; CHECK-LABEL: @image_sample_g16_c_cd_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4072,9 +4468,10 @@ } define amdgpu_kernel void @image_sample_g16_c_cd_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; CHECK-LABEL: @image_sample_g16_c_cd_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4087,9 +4484,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float 
[[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4100,9 +4498,10 @@ } define amdgpu_kernel void @image_sample_g16_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4115,9 +4514,10 @@ } define amdgpu_kernel void @image_sample_g16_c_cd_cl_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_cd_cl_1d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_cl_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4128,9 +4528,10 @@ } define amdgpu_kernel void 
@image_sample_g16_c_cd_cl_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @image_sample_g16_c_cd_cl_2d( -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_cd_cl_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4143,9 +4544,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V1(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { -; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V1( -; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V1 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4158,9 +4560,10 @@ } define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V2(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { -; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V2( -; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float 
[[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 8 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V2 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET]], float [[ZCOMPARE]], half [[DSDH]], half [[DTDH]], half [[DSDV]], half [[DTDV]], float [[S]], float [[T]], float [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[RES]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; %dsdh32 = fpext half %dsdh to float @@ -4177,9 +4580,10 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @image_sample_a16_1d_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1d_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4189,9 +4593,10 @@ } define amdgpu_kernel void @image_sample_a16_1d_nnan_ninf_nsz(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d_nnan_ninf_nsz( -; CHECK-NEXT: [[RES:%.*]] = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1d_nnan_ninf_nsz +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4201,9 +4606,10 @@ } define amdgpu_kernel void @image_sample_a16_1d_fast(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @image_sample_a16_1d_fast( -; CHECK-NEXT: [[RES:%.*]] = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: 
define amdgpu_kernel void @image_sample_a16_1d_fast +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4213,9 +4619,10 @@ } define amdgpu_kernel void @image_sample_a16_2d_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { -; CHECK-LABEL: @image_sample_a16_2d_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2d_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4226,9 +4633,10 @@ } define amdgpu_kernel void @image_sample_a16_3d_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { -; CHECK-LABEL: @image_sample_a16_3d_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_3d_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S]], half [[T]], half [[R]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4241,9 +4649,10 @@ define amdgpu_kernel void @image_sample_a16_cube_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { ; -; CHECK-LABEL: @image_sample_a16_cube_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_cube_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S]], half [[T]], half [[FACE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 
false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4255,9 +4664,10 @@ } define amdgpu_kernel void @image_sample_a16_1darray_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { -; CHECK-LABEL: @image_sample_a16_1darray_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_1darray_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4268,9 +4678,10 @@ } define amdgpu_kernel void @image_sample_a16_2darray_nnan(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { -; CHECK-LABEL: @image_sample_a16_2darray_nnan( -; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-LABEL: define amdgpu_kernel void @image_sample_a16_2darray_nnan +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[RES:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S]], half [[T]], half [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; %s32 = fpext half %s to float @@ -4297,10 +4708,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2darray.v4f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 define amdgpu_kernel void @sample_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) { -; CHECK-LABEL: @sample_l_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4310,10 +4722,11 @@ } define amdgpu_kernel void @sample_l_2d(ptr addrspace(1) %out, 
<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4323,10 +4736,11 @@ } define amdgpu_kernel void @sample_c_l_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) { -; CHECK-LABEL: @sample_c_l_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4336,10 +4750,11 @@ } define amdgpu_kernel void @sample_c_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_c_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4349,10 +4764,11 @@ } define amdgpu_kernel void @sample_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %lod) { -; CHECK-LABEL: @sample_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { 
; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4362,10 +4778,11 @@ } define amdgpu_kernel void @sample_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4375,10 +4792,11 @@ } define amdgpu_kernel void @sample_c_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %lod) { -; CHECK-LABEL: @sample_c_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4388,10 +4806,11 @@ } define amdgpu_kernel void @sample_c_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @sample_c_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float 
[[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4401,10 +4820,11 @@ } define amdgpu_kernel void @gather4_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4414,10 +4834,11 @@ } define amdgpu_kernel void @gather4_c_l_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_c_l_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_l_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4427,10 +4848,11 @@ } define amdgpu_kernel void @gather4_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 15, i32 
[[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4440,10 +4862,11 @@ } define amdgpu_kernel void @gather4_c_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @gather4_c_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4453,10 +4876,11 @@ } define amdgpu_kernel void @gather4_c_l_o_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %slice, float %lod) { -; CHECK-LABEL: @gather4_c_l_o_2darray( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_l_o_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2darray.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], float [[SLICE]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4470,10 +4894,11 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @load_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s) { -; CHECK-LABEL: @load_mip_1d( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; 
CHECK-NEXT: ret void ; main_body: @@ -4483,10 +4908,11 @@ } define amdgpu_kernel void @load_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t) { -; CHECK-LABEL: @load_mip_2d( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4496,10 +4922,11 @@ } define amdgpu_kernel void @load_mip_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @load_mip_3d( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4509,10 +4936,11 @@ } define amdgpu_kernel void @load_mip_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t) { -; CHECK-LABEL: @load_mip_1darray( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_1darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4522,10 +4950,11 @@ } define amdgpu_kernel void @load_mip_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @load_mip_2darray( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; 
CHECK-NEXT: ret void ; main_body: @@ -4535,10 +4964,11 @@ } define amdgpu_kernel void @load_mip_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @load_mip_cube( +; CHECK-LABEL: define amdgpu_kernel void @load_mip_cube +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4549,9 +4979,10 @@ define amdgpu_kernel void @store_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { -; CHECK-LABEL: @store_mip_1d( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4560,9 +4991,10 @@ } define amdgpu_kernel void @store_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { -; CHECK-LABEL: @store_mip_2d( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4571,9 +5003,10 @@ } define amdgpu_kernel void @store_mip_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @store_mip_3d( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_3d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4582,9 +5015,10 @@ } define amdgpu_kernel void @store_mip_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { -; CHECK-LABEL: @store_mip_1darray( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_1darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x 
float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4593,9 +5027,10 @@ } define amdgpu_kernel void @store_mip_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @store_mip_2darray( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_2darray +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4604,9 +5039,10 @@ } define amdgpu_kernel void @store_mip_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) { -; CHECK-LABEL: @store_mip_cube( +; CHECK-LABEL: define amdgpu_kernel void @store_mip_cube +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x float> [[VDATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> [[VDATA]], i32 15, i32 [[S]], i32 [[T]], i32 [[U]], <8 x i32> [[RSRC]], i32 0, i32 0) ; CHECK-NEXT: ret void ; main_body: @@ -4649,10 +5085,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 define amdgpu_kernel void @sample_b_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @sample_b_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4662,10 +5099,11 @@ } define amdgpu_kernel void @sample_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @sample_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) 
#[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4675,10 +5113,11 @@ } define amdgpu_kernel void @sample_c_b_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) { -; CHECK-LABEL: @sample_c_b_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4688,10 +5127,11 @@ } define amdgpu_kernel void @sample_c_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @sample_c_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4701,10 +5141,11 @@ } define amdgpu_kernel void @sample_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s) { -; CHECK-LABEL: @sample_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], <8 x 
i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4714,10 +5155,11 @@ } define amdgpu_kernel void @sample_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { -; CHECK-LABEL: @sample_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4727,10 +5169,11 @@ } define amdgpu_kernel void @sample_c_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s) { -; CHECK-LABEL: @sample_c_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4740,10 +5183,11 @@ } define amdgpu_kernel void @sample_c_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @sample_c_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4753,10 +5197,11 @@ } define amdgpu_kernel void 
@gather4_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @gather4_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4766,10 +5211,11 @@ } define amdgpu_kernel void @gather4_c_b_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @gather4_c_b_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_b_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4779,10 +5225,11 @@ } define amdgpu_kernel void @gather4_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { -; CHECK-LABEL: @gather4_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4792,10 +5239,11 @@ } define amdgpu_kernel void @gather4_c_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @gather4_c_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @gather4_c_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], 
float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4805,10 +5253,11 @@ } define amdgpu_kernel void @sample_c_b_o_a16_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %s, half %t) { -; CHECK-LABEL: @sample_c_b_o_a16_2d( +; CHECK-LABEL: define amdgpu_kernel void @sample_c_b_o_a16_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f16(i32 15, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f16(i32 15, i32 [[OFFSET]], float [[ZCOMPARE]], half [[S]], half [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4819,10 +5268,11 @@ ; Check that bias is not optimized away if > 0 define amdgpu_kernel void @sample_b_1d_pos(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @sample_b_1d_pos( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d_pos +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float 1.000000e+00, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float 1.000000e+00, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4833,10 +5283,11 @@ ; Check that bias is not optimized away if < 0 define amdgpu_kernel void @sample_b_1d_neg(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @sample_b_1d_neg( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d_neg +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float -1.000000e+00, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 
false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float -1.000000e+00, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4847,10 +5298,11 @@ ; Zero bias + A16 define amdgpu_kernel void @sample_b_1d_a16(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { -; CHECK-LABEL: @sample_b_1d_a16( +; CHECK-LABEL: define amdgpu_kernel void @sample_b_1d_a16 +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], half [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4865,10 +5317,11 @@ ; -------------------------------------------------------------------- define amdgpu_kernel void @offset_sample_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @offset_sample_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4878,10 +5331,11 @@ } define amdgpu_kernel void @offset_sample_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @offset_sample_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4891,10 +5345,11 @@ } define amdgpu_kernel void @offset_sample_c_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float 
%s) { -; CHECK-LABEL: @offset_sample_c_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4904,10 +5359,11 @@ } define amdgpu_kernel void @offset_sample_c_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4917,10 +5373,11 @@ } define amdgpu_kernel void @offset_sample_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4930,10 +5387,11 @@ } define amdgpu_kernel void @offset_sample_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4943,10 +5401,11 @@ } define amdgpu_kernel void @offset_sample_c_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4956,10 +5415,11 @@ } define amdgpu_kernel void @offset_sample_c_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4969,10 +5429,11 @@ } define amdgpu_kernel void @offset_sample_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s) { -; CHECK-LABEL: @offset_sample_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; 
CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4982,10 +5443,11 @@ } define amdgpu_kernel void @offset_sample_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { -; CHECK-LABEL: @offset_sample_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -4995,10 +5457,11 @@ } define amdgpu_kernel void @offset_sample_c_b_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s) { -; CHECK-LABEL: @offset_sample_c_b_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5008,10 +5471,11 @@ } define amdgpu_kernel void @offset_sample_c_b_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_b_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 
15, float [[BIAS]], float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5021,10 +5485,11 @@ } define amdgpu_kernel void @offset_sample_b_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_b_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5034,10 +5499,11 @@ } define amdgpu_kernel void @offset_sample_b_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_b_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_b_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5047,10 +5513,11 @@ } define amdgpu_kernel void @offset_sample_c_b_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_b_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS]], float 
[[ZCOMPARE]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5060,10 +5527,11 @@ } define amdgpu_kernel void @offset_sample_c_b_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_c_b_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_b_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS]], float [[ZCOMPARE]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5073,10 +5541,11 @@ } define amdgpu_kernel void @offset_sample_d_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_d_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5086,10 +5555,11 @@ } define amdgpu_kernel void @offset_sample_d_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_d_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; 
CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5099,10 +5569,11 @@ } define amdgpu_kernel void @offset_sample_c_d_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_c_d_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5112,10 +5583,11 @@ } define amdgpu_kernel void @offset_sample_c_d_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_d_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5125,10 +5597,11 @@ } define amdgpu_kernel void @offset_sample_d_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_d_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5138,10 +5611,11 @@ } define amdgpu_kernel void @offset_sample_d_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_d_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_d_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5151,10 +5625,11 @@ } define amdgpu_kernel void @offset_sample_c_d_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_d_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5164,10 +5639,11 @@ } define amdgpu_kernel void @offset_sample_c_d_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { 
-; CHECK-LABEL: @offset_sample_c_d_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_d_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5177,10 +5653,11 @@ } define amdgpu_kernel void @offset_sample_cd_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_cd_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5190,10 +5667,11 @@ } define amdgpu_kernel void @offset_sample_cd_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_cd_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: 
store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5203,10 +5681,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { -; CHECK-LABEL: @offset_sample_c_cd_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5216,10 +5695,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_cd_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5229,10 +5709,11 @@ } define amdgpu_kernel void @offset_sample_cd_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_cd_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) 
[[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5242,10 +5723,11 @@ } define amdgpu_kernel void @offset_sample_cd_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_cd_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_cd_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5255,10 +5737,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_cl_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cd_cl_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_cl_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DSDV]], float [[S]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5268,10 +5751,11 @@ } define amdgpu_kernel void @offset_sample_c_cd_cl_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; CHECK-LABEL: @offset_sample_c_cd_cl_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_cd_cl_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float 
[[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float [[ZCOMPARE]], float [[DSDH]], float [[DTDH]], float [[DSDV]], float [[DTDV]], float [[S]], float [[T]], float [[CLAMP]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5281,10 +5765,11 @@ } define amdgpu_kernel void @offset_sample_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) { -; CHECK-LABEL: @offset_sample_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float [[S]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5294,10 +5779,11 @@ } define amdgpu_kernel void @offset_sample_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { -; CHECK-LABEL: @offset_sample_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float [[S]], float [[T]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5307,10 +5793,11 @@ } define amdgpu_kernel void @offset_sample_c_l_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) { -; CHECK-LABEL: @offset_sample_c_l_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_l_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]]) 
#[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5320,10 +5807,11 @@ } define amdgpu_kernel void @offset_sample_c_l_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { -; CHECK-LABEL: @offset_sample_c_l_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_l_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], float [[LOD]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5333,10 +5821,11 @@ } define amdgpu_kernel void @offset_sample_lz_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; CHECK-LABEL: @offset_sample_lz_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_lz_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5346,10 +5835,11 @@ } define amdgpu_kernel void @offset_sample_lz_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { -; CHECK-LABEL: @offset_sample_lz_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_lz_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: 
[[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5359,10 +5849,11 @@ } define amdgpu_kernel void @offset_sample_c_lz_o_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) { -; CHECK-LABEL: @offset_sample_c_lz_o_1d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_lz_o_1d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5372,10 +5863,11 @@ } define amdgpu_kernel void @offset_sample_c_lz_o_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { -; CHECK-LABEL: @offset_sample_c_lz_o_2d( +; CHECK-LABEL: define amdgpu_kernel void @offset_sample_c_lz_o_2d +; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i32> inreg [[RSRC:%.*]], <4 x i32> inreg [[SAMP:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) -; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT:%.*]], align 16 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float [[ZCOMPARE]], float [[S]], float [[T]], <8 x i32> [[RSRC]], <4 x i32> [[SAMP]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[V]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; main_body: @@ -5432,7 +5924,8 @@ declare i1 @llvm.amdgcn.is.shared(ptr) nounwind readnone define i1 @test_is_shared_null() nounwind { -; CHECK-LABEL: @test_is_shared_null( +; CHECK-LABEL: define i1 @test_is_shared_null +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.is.shared(ptr null) @@ -5440,7 +5933,8 @@ } define i1 @test_is_shared_undef() nounwind { -; CHECK-LABEL: @test_is_shared_undef( +; CHECK-LABEL: define i1 @test_is_shared_undef +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 undef ; %val = call i1 @llvm.amdgcn.is.shared(ptr undef) @@ -5454,7 +5948,8 @@ declare i1 @llvm.amdgcn.is.private(ptr) nounwind readnone define i1 @test_is_private_null() nounwind { -; CHECK-LABEL: @test_is_private_null( +; CHECK-LABEL: define i1 @test_is_private_null +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret i1 false ; %val = call i1 @llvm.amdgcn.is.private(ptr null) @@ -5462,7 +5957,8 @@ } define i1 @test_is_private_undef() nounwind { -; CHECK-LABEL: @test_is_private_undef( +; CHECK-LABEL: define i1 @test_is_private_undef +; CHECK-SAME: () #[[ATTR1]] { ; 
 ; CHECK-NEXT: ret i1 undef
 ;
   %val = call i1 @llvm.amdgcn.is.private(ptr undef)
@@ -5477,8 +5973,9 @@
 declare float @llvm.amdgcn.trig.preop.f32(float, i32)

 define double @trig_preop_constfold_variable_undef_arg(i32 %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_undef_arg(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_constfold_variable_undef_arg
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 %arg)
@@ -5486,8 +5983,9 @@
 }

 define double @trig_preop_constfold_variable_poison_arg(i32 %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_poison_arg(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_constfold_variable_poison_arg
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 %arg)
@@ -5495,8 +5993,9 @@
 }

 define double @trig_preop_constfold_variable_arg_undef(double %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_arg_undef(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 undef)
+; CHECK-LABEL: define double @trig_preop_constfold_variable_arg_undef
+; CHECK-SAME: (double [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG]], i32 undef)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 undef)
@@ -5504,8 +6003,9 @@
 }

 define double @trig_preop_constfold_variable_arg_poison(double %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_arg_poison(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 poison)
+; CHECK-LABEL: define double @trig_preop_constfold_variable_arg_poison
+; CHECK-SAME: (double [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG]], i32 poison)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 poison)
@@ -5513,8 +6013,9 @@
 }

 define double @trig_preop_constfold_variable_int(i32 %arg) {
-; CHECK-LABEL: @trig_preop_constfold_variable_int(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_constfold_variable_int
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 %arg)
@@ -5522,8 +6023,9 @@
 }

 define double @trig_preop_qnan(i32 %arg) {
-; CHECK-LABEL: @trig_preop_qnan(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_qnan
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 %arg)
@@ -5531,8 +6033,9 @@
 }

 define double @trig_preop_snan(i32 %arg) {
-; CHECK-LABEL: @trig_preop_snan(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG:%.*]])
+; CHECK-LABEL: define double @trig_preop_snan
+; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 %arg)
@@ -5540,7 +6043,8 @@
 }

 define double @trig_preop_inf_0() {
-; CHECK-LABEL: @trig_preop_inf_0(
+; CHECK-LABEL: define double @trig_preop_inf_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5549,7 +6053,8 @@
 }

 define double @trig_preop_ninf_0() {
-; CHECK-LABEL: @trig_preop_ninf_0(
+; CHECK-LABEL: define double @trig_preop_ninf_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5558,8 +6063,9 @@
 }

 define double @trig_preop_variable_fp(double %arg) {
-; CHECK-LABEL: @trig_preop_variable_fp(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 5)
+; CHECK-LABEL: define double @trig_preop_variable_fp
+; CHECK-SAME: (double [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG]], i32 5)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 5)
@@ -5567,8 +6073,9 @@
 }

 define double @trig_preop_variable_args(double %arg0, i32 %arg1) {
-; CHECK-LABEL: @trig_preop_variable_args(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG0:%.*]], i32 [[ARG1:%.*]])
+; CHECK-LABEL: define double @trig_preop_variable_args
+; CHECK-SAME: (double [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG0]], i32 [[ARG1]])
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double %arg0, i32 %arg1)
@@ -5576,7 +6083,8 @@
 }

 define double @trig_preop_constfold() {
-; CHECK-LABEL: @trig_preop_constfold(
+; CHECK-LABEL: define double @trig_preop_constfold
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5585,8 +6093,9 @@
 }

 define double @trig_preop_constfold_strictfp() {
-; CHECK-LABEL: @trig_preop_constfold_strictfp(
-; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR16]]
+; CHECK-LABEL: define double @trig_preop_constfold_strictfp
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR15]]
 ; CHECK-NEXT: ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) strictfp
@@ -5594,7 +6103,8 @@
 }

 define double @trig_preop_constfold_0.0__0() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__0(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5603,7 +6113,8 @@
 }

 define double @trig_preop_constfold_0.0__1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__1(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__1
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 1)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5612,7 +6123,8 @@
 }

 define double @trig_preop_constfold_0.0__neg1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg1(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__neg1
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -1)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5621,7 +6133,8 @@
 }

 define double @trig_preop_constfold_0.0__9999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__9999999(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__9999999
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 9999999)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5630,7 +6143,8 @@
 }

 define double @trig_preop_constfold_0.0__neg999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg999999(
+; CHECK-LABEL: define double @trig_preop_constfold_0.0__neg999999
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -999999)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5639,7 +6153,8 @@
 }

 define double @trig_preop_constfold_0x0020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x0020000000000000_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x0020000000000000_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x10000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5648,7 +6163,8 @@
 }

 define double @trig_preop_constfold_0x001fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x001fffffffffffff_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x001fffffffffffff_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5657,7 +6173,8 @@
 }

 define double @trig_preop_constfold_0x8020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x8020000000000000_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x8020000000000000_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
@@ -5666,7 +6183,8 @@
 }

 define double @trig_preop_constfold_0x801fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x801fffffffffffff_0(
+; CHECK-LABEL: define double @trig_preop_constfold_0x801fffffffffffff_0
+; CHECK-SAME: () #[[ATTR3]] {
 ; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x801FFFFFFFFFFFFF, i32 0)
 ; CHECK-NEXT: ret double [[VAL]]
 ;
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -555,12 +555,12 @@
 define i32 @test_permlane16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) {
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg3
-  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
   %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)

   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg4
-  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   %v2 = call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   ret i32 %v2
 }
@@ -569,12 +569,12 @@
 define i32 @test_permlanex16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) {
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg3
-  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
+  ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)
   %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false)

   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %arg4
-  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
+  ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   %v2 = call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4)
   ret i32 %v2
 }
@@ -600,7 +600,6 @@
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i32 %arg2
   ; CHECK-NEXT: %val0 = call float @llvm.amdgcn.interp.p2(float %arg0, float %arg1, i32 %arg2, i32 0, i32 0)
-  %val0 = call float @llvm.amdgcn.interp.p2(float %arg0, float %arg1, i32 %arg2, i32 0, i32 0)
   store volatile float %val0, ptr addrspace(1) undef