diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14873,6 +14873,10 @@
     }
     LLVM_FALLTHROUGH;
   }
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
+  case AMDGPU::BI__builtin_amdgcn_readlane:
+    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
   default:
     return nullptr;
   }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -291,15 +291,15 @@
   *out = __builtin_amdgcn_ds_bpermute(a, b);
 }
 
-// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK-LABEL: @test_readfirstlane(
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
 }
 
 // CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
 void test_readlane(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_readlane(a, b);
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1416,26 +1416,25 @@
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 def int_amdgcn_readfirstlane :
-  GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
-  GCCBuiltin<"__builtin_amdgcn_readlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+  Intrinsic<[llvm_any_ty],
+            [LLVMMatchType<0>, // data input
+             llvm_i32_ty],     // uniform lane select
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The value to write and lane select arguments must be uniform across the
 // currently active threads of the current wave. Otherwise, the result is
 // undefined.
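+//
+// Illustrative IR usage of the overloaded forms (an editorial sketch; the
+// exact mangled names follow the tests added by this patch): the payload
+// type is mangled into the intrinsic name, e.g.
+//   %r = call i32 @llvm.amdgcn.readlane.i32(i32 %data, i32 %lane)
+//   %h = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %val)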
 def int_amdgcn_writelane :
-  GCCBuiltin<"__builtin_amdgcn_writelane">,
-  Intrinsic<[llvm_i32_ty], [
-    llvm_i32_ty,      // uniform value to write: returned by the selected lane
-    llvm_i32_ty,      // uniform lane select
-    llvm_i32_ty       // returned by all lanes other than the selected one
+  Intrinsic<[llvm_any_ty], [
+    LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+    llvm_i32_ty,      // uniform lane select
+    LLVMMatchType<0>  // returned by all lanes other than the selected one
   ],
   [IntrNoMem, IntrConvergent, IntrWillReturn]
 >;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -285,11 +285,11 @@
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
   Function *PermLaneX16 =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty});
 
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
@@ -344,11 +344,11 @@
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {Ty});
   Function *WriteLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {Ty});
 
   if (ST->hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
@@ -490,25 +490,8 @@
   // each active lane in the wavefront. This will be our new value which we
   // will provide to the atomic operation.
   Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-  if (TyBitWidth == 64) {
-    Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
-    CallInst *const ReadLaneLo = B.CreateIntrinsic(
-        Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
-    CallInst *const ReadLaneHi = B.CreateIntrinsic(
-        Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
-    Value *const PartialInsert = B.CreateInsertElement(
-        UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
-    NewV = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                             {NewV, LastLaneIdx});
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {Ty},
+                           {NewV, LastLaneIdx});
 
   // Finally mark the readlanes in the WWM section.
   NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
@@ -587,27 +570,8 @@
   // We need to broadcast the value who was the lowest active lane (the first
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
-  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
-  Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
-    CallInst *const ReadFirstLaneLo =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-    CallInst *const ReadFirstLaneHi =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  // which now supports arbitrary types directly.
+  Value *BroadcastI =
+      B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {Ty}, {PHI});
 
   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -224,6 +224,10 @@
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitLaneIntrinsicInst(IntrinsicInst &I);
+  Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID,
+                                 Value *Data0, Value *Lane = nullptr,
+                                 Value *Data1 = nullptr);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -1344,6 +1348,10 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane:
+    return visitLaneIntrinsicInst(I);
   default:
     return false;
   }
@@ -1359,6 +1367,138 @@
   return Changed;
 }
 
+Value *AMDGPUCodeGenPrepare::buildLegalLaneIntrinsic(IRBuilder<> &B,
+                                                     Intrinsic::ID IID,
+                                                     Value *Data0, Value *Lane,
+                                                     Value *Data1) {
+  Type *Ty = Data0->getType();
+
+  if (Ty == B.getInt32Ty()) {
+    Value *Args[3] = {Data0, Lane, Data1};
+    unsigned NumArgs = Data1 != nullptr ? 3 : Lane != nullptr ? 2 : 1;
+    return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs});
+  }
+
+  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+    Type *EltType = VecTy->getElementType();
+    bool is16Bit =
+        (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) ||
+        (EltType->isHalfTy());
+    int EC = VecTy->getElementCount().Min;
+
+    Value *Result = UndefValue::get(Ty);
+    for (int i = 0; i < EC; i += 1 + is16Bit) {
+      Value *EltData0;
+      Value *EltData1 = nullptr;
+
+      if (is16Bit) {
+        int Idxs[2] = {i, i + 1};
+        EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs);
+        EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty());
+      } else {
+        EltData0 = B.CreateExtractElement(Data0, i);
+      }
+
+      if (Data1) {
+        if (is16Bit) {
+          int Idxs[2] = {i, i + 1};
+          EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs);
+          EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty());
+        } else {
+          EltData1 = B.CreateExtractElement(Data1, i);
+        }
+      }
+
+      Value *EltResult =
+          buildLegalLaneIntrinsic(B, IID, EltData0, Lane, EltData1);
+
+      if (is16Bit) {
+        EltResult =
+            B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2));
+        for (int j = 0; j < 2; ++j) {
+          if (i + j >= EC)
+            break;
+          Result = B.CreateInsertElement(
+              Result, B.CreateExtractElement(EltResult, j), i + j);
+        }
+      } else {
+        Result = B.CreateInsertElement(Result, EltResult, i);
+      }
+    }
+
+    return Result;
+  }
+
+  unsigned BitWidth = DL->getTypeSizeInBits(Ty);
+  Type *IntTy = Ty;
+
+  if (!Ty->isIntegerTy()) {
+    IntTy = IntegerType::get(Mod->getContext(), BitWidth);
+    Data0 = B.CreateBitOrPointerCast(Data0, IntTy);
+    if (Data1)
+      Data1 = B.CreateBitOrPointerCast(Data1, IntTy);
+  }
+
+  if ((BitWidth % 32) != 0) {
+    Type *ExtendedTy =
+        IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31);
+    Data0 = B.CreateZExt(Data0, ExtendedTy);
+    if (Data1)
+      Data1 = B.CreateZExt(Data1, ExtendedTy);
+  }
+
+  if (BitWidth > 32) {
+    Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32);
+    Data0 = B.CreateBitCast(Data0, VecTy);
+    if (Data1)
+      Data1 = B.CreateBitCast(Data1, VecTy);
+  }
+
+  Value *Result = buildLegalLaneIntrinsic(B, IID, Data0, Lane, Data1);
+
+  if ((BitWidth % 32) != 0) {
+    if (BitWidth > 32) {
+      Result = B.CreateBitCast(
+          Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31));
+    }
+
+    Result =
+        B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth));
+  }
+
+  return B.CreateBitOrPointerCast(Result, Ty);
+}
+
+/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics
+/// on i32.
+///
+/// Done during codegen prepare purely because this turned out to be simpler
+/// than doing it in this generality in SelectionDAG.
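+///
+/// For example, a sketch of the intended rewrite (value names illustrative):
+///   %r = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %lane)
+/// becomes
+///   %ext = zext i16 %src to i32
+///   %leg = call i32 @llvm.amdgcn.readlane.i32(i32 %ext, i32 %lane)
+///   %r = trunc i32 %leg to i16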
+bool AMDGPUCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I) {
+  Type *Ty = I.getType();
+  if (Ty->isIntegerTy(32))
+    return false; // already legal
+
+  Value *Data0 = I.getArgOperand(0);
+  Value *Lane = nullptr;
+  Value *Data1 = nullptr;
+
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_readlane) {
+    Lane = I.getArgOperand(1);
+  } else if (I.getIntrinsicID() == Intrinsic::amdgcn_writelane) {
+    Lane = I.getArgOperand(1);
+    Data1 = I.getArgOperand(2);
+  }
+
+  IRBuilder<> Builder(&I);
+  Value *Legalized =
+      buildLegalLaneIntrinsic(Builder, I.getIntrinsicID(), Data0, Lane, Data1);
+
+  I.replaceAllUsesWith(Legalized);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   DL = &Mod->getDataLayout();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -655,6 +655,17 @@
     if (match(Src,
               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
       return IC.replaceInstUsesWith(II, Src);
     }
+
+    // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+    Value *BitcastInput = nullptr;
+    if (match(Src,
+              PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+      CallInst *NewCall =
+          IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                     {BitcastInput->getType()}, BitcastInput);
+      Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+      return IC.replaceInstUsesWith(II, NewCast);
+    }
   } else {
     // readlane (readlane x, y), y -> readlane x, y
     if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -715,6 +715,8 @@
   addPass(createAtomicExpandPass());
 
+  if (EnableAtomicOptimizations)
+    addPass(createAMDGPUAtomicOptimizerPass());
 
   addPass(createAMDGPULowerIntrinsicsPass());
@@ -871,10 +873,6 @@
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
-  if (EnableAtomicOptimizations) {
-    addPass(createAMDGPUAtomicOptimizerPass());
-  }
-
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2203,7 +2203,7 @@
 // FIXME: Should also do this for readlane, but tablegen crashes on
 // the ignored src1.
 def : GCNPat<
-  (int_amdgcn_readfirstlane (i32 imm:$src)),
+  (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
   (S_MOV_B32 SReg_32:$src)
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
@@ -1,11 +1,11 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
 # RUN: FileCheck -check-prefix=ERR %s < %t
 
-# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s)
+# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s32_s)
 
 ---
-name: readfirstlane_v
+name: readfirstlane_s32_v
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
 
@@ -13,7 +13,7 @@
 body: |
   bb.0:
     liveins: $vgpr0
-    ; GCN-LABEL: name: readfirstlane_v
+    ; GCN-LABEL: name: readfirstlane_s32_v
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
@@ -24,7 +24,7 @@
 ...
 
 ---
-name: readfirstlane_v_imm
+name: readfirstlane_v_s32_imm
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
 
@@ -32,7 +32,7 @@
 body: |
   bb.0:
 
-    ; GCN-LABEL: name: readfirstlane_v_imm
+    ; GCN-LABEL: name: readfirstlane_v_s32_imm
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]]
@@ -44,7 +44,7 @@
 
 # Make sure this fails to select
 ---
-name: readfirstlane_s
+name: readfirstlane_s32_s
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
 
@@ -52,7 +52,7 @@
 body: |
   bb.0:
     liveins: $sgpr0
-    ; GCN-LABEL: name: readfirstlane_s
+    ; GCN-LABEL: name: readfirstlane_s32_s
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,11 +1,20 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #0
+declare float @llvm.amdgcn.readfirstlane.f32(float) #0
+declare <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half>) #0
+declare <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16>) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.readfirstlane.i16(i16) #0
+declare half @llvm.amdgcn.readfirstlane.f16(half) #0
+declare <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float>) #0
 
-; CHECK-LABEL: {{^}}test_readfirstlane:
+; CHECK-LABEL: {{^}}test_readfirstlane_i32:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -15,7 +24,7 @@
 ; CHECK-NOT: [[SGPR_VAL]]
 ; CHECK: ; use [[SGPR_VAL]]
 define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
@@ -25,7 +34,7 @@
 ; CHECK-NOT: [[VVAL]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -36,7 +45,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %m0)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -51,7 +60,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -62,10 +71,91 @@
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %int)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readfirstlane_f32:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 {
+  %readfirstlane = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+  store float %readfirstlane, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 {
+  %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+  store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 {
+  %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
+  store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_p3:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readfirstlane = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
+  store i16 %readfirstlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f16(half addrspace(1)* %out, half %src) {
+  %readfirstlane = call half @llvm.amdgcn.readfirstlane.f16(half %src)
+  store half %readfirstlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v3i16:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane.v3i16(<3 x i16> %src)
+  store <3 x i16> %readfirstlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v9f32:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane.v9f32(<9 x float> %src)
+  store <9 x float> %readfirstlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readlane.p3i8(i8 addrspace(3)*, i32) #0
+declare i16 @llvm.amdgcn.readlane.i16(i16, i32) #0
+declare half @llvm.amdgcn.readlane.f16(half, i32) #0
+declare <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16>, i32) #0
+declare <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float>, i32) #0
 
 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
 ; CHECK-NOT: v_readlane_b32
@@ -77,6 +82,60 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readlane_p3:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readlane = call i8 addrspace(3)* @llvm.amdgcn.readlane.p3i8(i8 addrspace(3)* %src, i32 15)
+  store i8 addrspace(3)* %readlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_i16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 15)
+  store i16 %readlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_f16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_f16(half addrspace(1)* %out, half %src) {
+  %readlane = call half @llvm.amdgcn.readlane.f16(half %src, i32 15)
+  store half %readlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v3i16:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readlane = call <3 x i16> @llvm.amdgcn.readlane.v3i16(<3 x i16> %src, i32 15)
+  store <3 x i16> %readlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v9f32:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readlane = call <9 x float> @llvm.amdgcn.readlane.v9f32(<9 x float> %src, i32 15)
+  store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,6 +3,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.writelane.p3i8(i8 addrspace(3)*, i32, i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.writelane.i16(i16, i32, i16) #0
+declare half @llvm.amdgcn.writelane.f16(half, i32, half) #0
+declare <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16>, i32, <3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float>, i32, <9 x float>) #0
 
 ; CHECK-LABEL: {{^}}test_writelane_sreg:
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
@@ -79,6 +84,60 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_writelane_p3:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %writelane = call i8 addrspace(3)* @llvm.amdgcn.writelane.p3i8(i8 addrspace(3)* null, i32 15, i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %writelane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_i16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 1234, i32 15, i16 %src)
+  store i16 %writelane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_f16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_f16(half addrspace(1)* %out, half %src) {
+  %writelane = call half @llvm.amdgcn.writelane.f16(half 1.0, i32 15, half %src)
+  store half %writelane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v3i16:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %writelane = call <3 x i16> @llvm.amdgcn.writelane.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src)
+  store <3 x i16> %writelane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v9f32:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %writelane = call <9 x float> @llvm.amdgcn.writelane.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src)
+  store <9 x float> %writelane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,8 +2507,8 @@
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2530,8 +2530,8 @@
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0)
@@ -2541,8 +2541,8 @@
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2552,11 +2552,11 @@
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2570,11 +2570,11 @@
 define i32 @readfirstlane_readlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
@@ -2585,6 +2585,41 @@
   ret i32 %read1
 }
 
+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  %cast.read = bitcast i32 %read to float
+  ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT: store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  store i32 %bitcast.arg, i32* undef
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------
@@ -2593,8 +2628,8 @@
 define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2616,8 +2651,8 @@
 define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane)
@@ -2626,9 +2661,9 @@
 define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
 ; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP1]], i32 [[LANE1:%.*]])
+; CHECK-NEXT: ret i32 [[TMP2]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -2637,8 +2672,8 @@
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2648,11 +2683,11 @@
 define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP0]], i32 [[LANE]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2667,11 +2702,11 @@
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
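
For reference, the overloaded forms exercised by the tests above can be
written directly in IR. A minimal, self-contained sketch (the function and
value names here are illustrative only, not taken from the patch):

  declare half @llvm.amdgcn.readfirstlane.f16(half)
  declare <2 x i16> @llvm.amdgcn.readlane.v2i16(<2 x i16>, i32)
  declare float @llvm.amdgcn.writelane.f32(float, i32, float)

  define void @overload_example(half %h, <2 x i16> %v, float %f,
                                float addrspace(1)* %out) {
    ; each call mangles its payload type into the intrinsic name
    %rf = call half @llvm.amdgcn.readfirstlane.f16(half %h)
    %rl = call <2 x i16> @llvm.amdgcn.readlane.v2i16(<2 x i16> %v, i32 7)
    %wl = call float @llvm.amdgcn.writelane.f32(float 1.0, i32 7, float %f)
    store float %wl, float addrspace(1)* %out, align 4
    ret void
  }

AMDGPUCodeGenPrepare then rewrites any non-i32 form onto the i32 intrinsic as
sketched in buildLegalLaneIntrinsic above, so only v_readfirstlane_b32,
v_readlane_b32, and v_writelane_b32 are needed during instruction selection.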