diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14873,6 +14873,10 @@ } LLVM_FALLTHROUGH; } + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane2); + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane2); default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -291,15 +291,15 @@ *out = __builtin_amdgcn_ds_bpermute(a, b); } -// CHECK-LABEL: @test_readfirstlane -// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK-LABEL: @test_readfirstlane( +// CHECK: call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: call i32 @llvm.amdgcn.readlane2.i32(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1415,23 +1415,38 @@ Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]>; -def int_amdgcn_readfirstlane : - GCCBuiltin<"__builtin_amdgcn_readfirstlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], +def int_amdgcn_readfirstlane2 : + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn]>; // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. -def int_amdgcn_readlane : - GCCBuiltin<"__builtin_amdgcn_readlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_readlane2 : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, // data input + llvm_i32_ty], // uniform lane select [IntrNoMem, IntrConvergent, IntrWillReturn]>; // The value to write and lane select arguments must be uniform across the // currently active threads of the current wave. Otherwise, the result is // undefined. +def int_amdgcn_writelane2 : + Intrinsic<[llvm_any_ty], [ + LLVMMatchType<0>, // uniform value to write: returned by the selected lane + llvm_i32_ty, // uniform lane select + LLVMMatchType<0> // returned by all lanes other than the selected one + ], + [IntrNoMem, IntrConvergent, IntrWillReturn] +>; + +// Non-overloaded versions of readfirstlane2 / readlane2 / writelane2. 
+def int_amdgcn_readfirstlane : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn]>; +def int_amdgcn_readlane : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn]>; def int_amdgcn_writelane : - GCCBuiltin<"__builtin_amdgcn_writelane">, Intrinsic<[llvm_i32_ty], [ llvm_i32_ty, // uniform value to write: returned by the selected lane llvm_i32_ty, // uniform lane select diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -285,11 +285,11 @@ Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty}); Function *PermLaneX16 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane2, {Ty}); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -344,11 +344,11 @@ Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty}); Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane2, {Ty}); Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane2, {Ty}); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. @@ -490,25 +490,8 @@ // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); - CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); - CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); - NewV = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane2, {Ty}, + {NewV, LastLaneIdx}); // Finally mark the readlanes in the WWM section. NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); @@ -587,27 +570,8 @@ // We need to broadcast the value who was the lowest active lane (the first // lane) to all other lanes in the wavefront. We use an intrinsic for this, // but have to handle 64-bit broadcasts with two calls to this intrinsic. 
-  Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
-    CallInst *const ReadFirstLaneLo =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-    CallInst *const ReadFirstLaneHi =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  Value *BroadcastI =
+      B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane2, {Ty}, {PHI});
 
   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -79,7 +79,7 @@
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
   if (TRI->isSGPRReg(MRI, PhysReg)) {
-    auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+    auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane2,
                                             {MRI.getType(ExtReg)}, false)
                       .addReg(ExtReg);
     ExtReg = ToSGPR.getReg(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -224,6 +224,10 @@
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitLaneIntrinsicInst(IntrinsicInst &I, Intrinsic::ID CanonicalIID);
+  Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID,
+                                 Value *Data0, Value *Lane = nullptr,
+                                 Value *Data1 = nullptr);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -1344,6 +1348,16 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::amdgcn_readfirstlane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_readfirstlane2);
+  case Intrinsic::amdgcn_readlane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_readlane2);
+  case Intrinsic::amdgcn_writelane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_writelane2);
+  case Intrinsic::amdgcn_readfirstlane2:
+  case Intrinsic::amdgcn_readlane2:
+  case Intrinsic::amdgcn_writelane2:
+    return visitLaneIntrinsicInst(I, I.getIntrinsicID());
   default:
     return false;
   }
@@ -1359,6 +1373,140 @@
   return Changed;
 }
 
+Value *AMDGPUCodeGenPrepare::buildLegalLaneIntrinsic(IRBuilder<> &B,
+                                                     Intrinsic::ID IID,
+                                                     Value *Data0, Value *Lane,
+                                                     Value *Data1) {
+  Type *Ty = Data0->getType();
+
+  if (Ty == B.getInt32Ty()) {
+    Value *Args[3] = {Data0, Lane, Data1};
+    unsigned NumArgs = Data1 != nullptr ? 3 : Lane != nullptr ? 2 : 1;
+    return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs});
+  }
+
+  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
+    Type *EltType = VecTy->getElementType();
+    bool is16Bit =
+        (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) ||
+        (EltType->isHalfTy());
+    int EC = VecTy->getElementCount().Min;
+
+    Value *Result = UndefValue::get(Ty);
+    for (int i = 0; i < EC; i += 1 + is16Bit) {
+      Value *EltData0;
+      Value *EltData1 = nullptr;
+
+      if (is16Bit) {
+        int Idxs[2] = {i, i + 1};
+        EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs);
+        EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty());
+      } else {
+        EltData0 = B.CreateExtractElement(Data0, i);
+      }
+
+      if (Data1) {
+        if (is16Bit) {
+          int Idxs[2] = {i, i + 1};
+          EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs);
+          EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty());
+        } else {
+          EltData1 = B.CreateExtractElement(Data1, i);
+        }
+      }
+
+      Value *EltResult =
+          buildLegalLaneIntrinsic(B, IID, EltData0, Lane, EltData1);
+
+      if (is16Bit) {
+        EltResult =
+            B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2));
+        for (int j = 0; j < 2; ++j) {
+          if (i + j >= EC)
+            break;
+          Result = B.CreateInsertElement(
+              Result, B.CreateExtractElement(EltResult, j), i + j);
+        }
+      } else {
+        Result = B.CreateInsertElement(Result, EltResult, i);
+      }
+    }
+
+    return Result;
+  }
+
+  unsigned BitWidth = DL->getTypeSizeInBits(Ty);
+  Type *IntTy = Ty;
+
+  if (!Ty->isIntegerTy()) {
+    IntTy = IntegerType::get(Mod->getContext(), BitWidth);
+    Data0 = B.CreateBitOrPointerCast(Data0, IntTy);
+    if (Data1)
+      Data1 = B.CreateBitOrPointerCast(Data1, IntTy);
+  }
+
+  if ((BitWidth % 32) != 0) {
+    Type *ExtendedTy =
+        IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31);
+    Data0 = B.CreateZExt(Data0, ExtendedTy);
+    if (Data1)
+      Data1 = B.CreateZExt(Data1, ExtendedTy);
+  }
+
+  if (BitWidth > 32) {
+    Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32);
+    Data0 = B.CreateBitCast(Data0, VecTy);
+    if (Data1)
+      Data1 = B.CreateBitCast(Data1, VecTy);
+  }
+
+  Value *Result = buildLegalLaneIntrinsic(B, IID, Data0, Lane, Data1);
+
+  if ((BitWidth % 32) != 0) {
+    if (BitWidth > 32) {
+      Result = B.CreateBitCast(
+          Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31));
+    }
+
+    Result =
+        B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth));
+  }
+
+  return B.CreateBitOrPointerCast(Result, Ty);
+}
+
+/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics
+/// on i32.
+///
+/// Done during codegen prepare purely because this turned out to be simpler
+/// than doing it in this generality in SelectionDAG.
+bool AMDGPUCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I,
+                                                  Intrinsic::ID CanonicalIID) {
+  Type *Ty = I.getType();
+  if (I.getIntrinsicID() == CanonicalIID && Ty->isIntegerTy(32))
+    return false; // already legal
+
+  Value *Data0 = I.getArgOperand(0);
+  Value *Lane = nullptr;
+  Value *Data1 = nullptr;
+
+  if (CanonicalIID == Intrinsic::amdgcn_readlane2) {
+    Lane = I.getArgOperand(1);
+  } else if (CanonicalIID == Intrinsic::amdgcn_writelane2) {
+    Lane = I.getArgOperand(1);
+    Data1 = I.getArgOperand(2);
+  }
+
+  IRBuilder<> Builder(&I);
+  Value *Legalized =
+      buildLegalLaneIntrinsic(Builder, CanonicalIID, Data0, Lane, Data1);
+
+  I.replaceAllUsesWith(Legalized);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   DL = &Mod->getDataLayout();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -795,9 +795,12 @@
     switch (IntrID) {
     default:
       return false;
+    case Intrinsic::amdgcn_readfirstlane2:
+    case Intrinsic::amdgcn_readlane2:
+      return true;
     case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
-      return true;
+      llvm_unreachable("should have been updated during CodeGenPrepare");
    }
  }
    break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -630,7 +630,26 @@
     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
   }
   case Intrinsic::amdgcn_readfirstlane:
-  case Intrinsic::amdgcn_readlane: {
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane: {
+    // Canonicalize to overloaded intrinsics.
+    Intrinsic::ID NewIID;
+    if (IID == Intrinsic::amdgcn_readfirstlane)
+      NewIID = Intrinsic::amdgcn_readfirstlane2;
+    else if (IID == Intrinsic::amdgcn_readlane)
+      NewIID = Intrinsic::amdgcn_readlane2;
+    else
+      NewIID = Intrinsic::amdgcn_writelane2;
+
+    SmallVector<Value *, 3> Args;
+    for (Value *Arg : II.arg_operands())
+      Args.push_back(Arg);
+    CallInst *UpgradedCall = IC.Builder.CreateIntrinsic(
+        NewIID, {II.getArgOperand(0)->getType()}, Args);
+    return IC.replaceInstUsesWith(II, UpgradedCall);
+  }
+  case Intrinsic::amdgcn_readfirstlane2:
+  case Intrinsic::amdgcn_readlane2: {
     // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
@@ -646,18 +665,30 @@
     // readfirstlane (readfirstlane x) -> readfirstlane x
     // readlane (readfirstlane x), y -> readfirstlane x
     if (match(Src,
-              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane2>())) {
       return IC.replaceInstUsesWith(II, Src);
     }
 
-    if (IID == Intrinsic::amdgcn_readfirstlane) {
+    if (IID == Intrinsic::amdgcn_readfirstlane2) {
       // readfirstlane (readlane x, y) -> readlane x, y
-      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+      if (match(Src,
+                PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane2>())) {
         return IC.replaceInstUsesWith(II, Src);
       }
+
+      // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+      Value *BitcastInput = nullptr;
+      if (match(Src,
+                PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+        CallInst *NewCall =
+            IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane2,
+                                       {BitcastInput->getType()}, BitcastInput);
+        Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+        return IC.replaceInstUsesWith(II, NewCast);
+      }
     } else {
       // readlane (readlane x, y), y -> readlane x, y
-      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane2>(
                          PatternMatch::m_Value(),
                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
         return IC.replaceInstUsesWith(II, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -920,7 +920,7 @@
     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
   case Intrinsic::amdgcn_wwm:
     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
-  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_writelane2:
     return selectWritelane(I);
   case Intrinsic::amdgcn_div_scale:
     return selectDivScale(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -334,7 +334,7 @@
 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
   switch (MI.getIntrinsicID()) {
-  case Intrinsic::amdgcn_readlane: {
+  case Intrinsic::amdgcn_readlane2: {
     static const OpRegBankEntry<3> Table[2] = {
       // Perfectly legal.
       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
@@ -346,7 +346,7 @@
     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
   }
-  case Intrinsic::amdgcn_writelane: {
+  case Intrinsic::amdgcn_writelane2: {
     static const OpRegBankEntry<4> Table[4] = {
       // Perfectly legal.
{ { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, @@ -2966,7 +2966,7 @@ } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { - case Intrinsic::amdgcn_readlane: { + case Intrinsic::amdgcn_readlane2: { substituteSimpleCopyRegs(OpdMapper, 2); assert(OpdMapper.getVRegs(0).empty()); @@ -2977,7 +2977,7 @@ constrainOpWithReadfirstlane(MI, MRI, 3); // Index return; } - case Intrinsic::amdgcn_writelane: { + case Intrinsic::amdgcn_writelane2: { assert(OpdMapper.getVRegs(0).empty()); assert(OpdMapper.getVRegs(2).empty()); assert(OpdMapper.getVRegs(3).empty()); @@ -4115,7 +4115,7 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); break; } - case Intrinsic::amdgcn_readlane: { + case Intrinsic::amdgcn_readlane2: { // This must be an SGPR, but accept a VGPR. Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); @@ -4123,14 +4123,14 @@ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); LLVM_FALLTHROUGH; } - case Intrinsic::amdgcn_readfirstlane: { + case Intrinsic::amdgcn_readfirstlane2: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); break; } - case Intrinsic::amdgcn_writelane: { + case Intrinsic::amdgcn_writelane2: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); Register SrcReg = MI.getOperand(2).getReg(); unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -251,6 +251,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -715,6 +715,8 @@ addPass(createAtomicExpandPass()); + if (EnableAtomicOptimizations) + addPass(createAMDGPUAtomicOptimizerPass()); addPass(createAMDGPULowerIntrinsicsPass()); @@ -871,10 +873,6 @@ bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - if (EnableAtomicOptimizations) { - addPass(createAMDGPUAtomicOptimizerPass()); - } - // FIXME: We need to run a pass to propagate the attributes when calls are // supported. 
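Note on the pass reordering above: AMDGPUAtomicOptimizer now runs before AMDGPUCodeGenPrepare, so the wide lane intrinsics it emits (e.g. the i64 readfirstlane2 on 64-bit atomics) are still legalized to 32-bit pieces. A minimal sketch of the expected expansion, derived from the buildLegalLaneIntrinsic logic above; value names are illustrative, not taken from the patch's tests:

  ; before AMDGPUCodeGenPrepare:
  ;   %wide = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %v)
  ; after, the i64 is bitcast to <2 x i32> and handled one dword at a time:
  %vec = bitcast i64 %v to <2 x i32>
  %lo = extractelement <2 x i32> %vec, i64 0
  %rlo = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %lo)
  %hi = extractelement <2 x i32> %vec, i64 1
  %rhi = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %hi)
  %ins0 = insertelement <2 x i32> undef, i32 %rlo, i64 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %rhi, i64 1
  %wide.res = bitcast <2 x i32> %ins1 to i64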
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -843,6 +843,8 @@ switch (Intrinsic->getIntrinsicID()) { default: return false; + case Intrinsic::amdgcn_readfirstlane2: + case Intrinsic::amdgcn_readlane2: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_icmp: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -802,6 +802,8 @@ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v3i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v3f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2203,7 +2203,7 @@ // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (int_amdgcn_readfirstlane2 (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -163,7 +163,7 @@ InstSI <(outs SReg_32:$vdst), (ins VRegOrLds_32:$src0), "v_readfirstlane_b32 $vdst, $src0", - [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLds_32:$src0)))]>, + [(set i32:$vdst, (int_amdgcn_readfirstlane2 (i32 VRegOrLds_32:$src0)))]>, Enc32 { let isCodeGenOnly = 0; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -525,11 +525,11 @@ // These are special and do not read the exec mask. 
let isConvergent = 1, Uses = [] in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, - [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; + [(set i32:$vdst, (int_amdgcn_readlane2 i32:$src0, i32:$src1))]>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, - [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; + [(set i32:$vdst, (int_amdgcn_writelane2 i32:$src0, i32:$src1, i32:$vdst_in))]>; } // End $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir @@ -1,11 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s # RUN: FileCheck -check-prefix=ERR %s < %t -# ERR: remark: :0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s) +# ERR: remark: :0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0:sgpr(s32) (in function: readfirstlane_s32_s) --- -name: readfirstlane_v +name: readfirstlane_s32_v legalized: true regBankSelected: true tracksRegLiveness: true @@ -13,18 +13,18 @@ body: | bb.0: liveins: $vgpr0 - ; GCN-LABEL: name: readfirstlane_v + ; GCN-LABEL: name: readfirstlane_s32_v ; GCN: liveins: $vgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]] %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 S_ENDPGM 0, implicit %1 ... --- -name: readfirstlane_v_imm +name: readfirstlane_v_s32_imm legalized: true regBankSelected: true tracksRegLiveness: true @@ -32,19 +32,19 @@ body: | bb.0: - ; GCN-LABEL: name: readfirstlane_v_imm + ; GCN-LABEL: name: readfirstlane_v_s32_imm ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]] ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]] ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]] %0:vgpr(s32) = G_CONSTANT i32 123 - %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 S_ENDPGM 0, implicit %1 ... 
# Make sure this fails to select --- -name: readfirstlane_s +name: readfirstlane_s32_s legalized: true regBankSelected: true tracksRegLiveness: true @@ -52,12 +52,12 @@ body: | bb.0: liveins: $sgpr0 - ; GCN-LABEL: name: readfirstlane_s + ; GCN-LABEL: name: readfirstlane_s32_s ; GCN: liveins: $sgpr0 ; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; GCN: S_ENDPGM 0, implicit [[INT]](s32) %0:sgpr(s32) = COPY $sgpr0 - %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 S_ENDPGM 0, implicit %1 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll @@ -47,7 +47,7 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 ret i32 %vgpr @@ -61,9 +61,9 @@ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ret i64 %vgpr @@ -77,9 +77,9 @@ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ret <2 x i32> %vgpr @@ -92,9 +92,9 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll @@ -76,9 +76,9 @@ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 main_body: @@ -91,7 +91,7 @@ ; CHECK-LABEL: name: non_void_ret ; CHECK: bb.1 (%ir-block.0): ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[C]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir @@ -12,9 +12,9 @@ ; CHECK-LABEL: name: readfirstlane_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 ... --- @@ -26,7 +26,7 @@ liveins: $vgpr0 ; CHECK-LABEL: name: readfirstlane_v ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0 ... 
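For reference, a sketch of the IR-level surface these tests assume: only the data operand participates in the overload (the suffix follows the usual intrinsic type-mangling rules), while the lane select stays i32. The f32 overload here is an assumed example, consistent with the declarations added in the .ll tests below:

  declare i32 @llvm.amdgcn.readlane2.i32(i32, i32)
  declare float @llvm.amdgcn.readlane2.f32(float, i32)

  define float @sample(float %data, i32 %lane) {
    ; %lane must be uniform across the active lanes of the wave
    %r = call float @llvm.amdgcn.readlane2.f32(float %data, i32 %lane)
    ret float %r
  }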
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir @@ -13,10 +13,10 @@ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY2]](s32), [[COPY1]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... --- @@ -29,10 +29,10 @@ ; CHECK-LABEL: name: readlane_vs ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY]](s32), [[COPY1]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... --- @@ -46,10 +46,10 @@ ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... --- @@ -64,8 +64,8 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 - %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -15,7 +15,7 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -35,10 +35,10 @@ ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4) ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -61,13 +61,13 @@ ; CHECK: [[UV:%[0-9]+]]:sgpr(<3 x s32>), [[UV1:%[0-9]+]]:sgpr(<3 x s32>), [[UV2:%[0-9]+]]:sgpr(<3 x s32>), [[UV3:%[0-9]+]]:sgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) ; CHECK: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) - ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x 
i32> %rsrc, i32 %soffset, i32 0) @@ -87,28 +87,28 @@ ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4) ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) - ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) - ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY8]](s32) ; CHECK: $sgpr3 = COPY [[INT3]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) - ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY9]](s32) ; CHECK: $sgpr4 = COPY [[INT4]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) - ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY10]](s32) ; CHECK: $sgpr5 = COPY [[INT5]](s32) ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) - ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY11]](s32) ; CHECK: $sgpr6 = COPY [[INT6]](s32) ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) - ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY12]](s32) ; CHECK: $sgpr7 = COPY [[INT7]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -128,52 +128,52 @@ ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4) ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), 
[[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32) - ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32) + ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32) ; CHECK: $sgpr1 = COPY [[INT1]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32) - ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32) + ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32) ; CHECK: $sgpr2 = COPY [[INT2]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32) - ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32) + ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY8]](s32) ; CHECK: $sgpr3 = COPY [[INT3]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32) - ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY9]](s32) ; CHECK: $sgpr4 = COPY [[INT4]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32) - ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32) + ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY10]](s32) ; CHECK: $sgpr5 = COPY [[INT5]](s32) ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32) - ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32) + ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY11]](s32) ; CHECK: $sgpr6 = COPY [[INT6]](s32) ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32) - ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32) + ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY12]](s32) ; CHECK: $sgpr7 = COPY [[INT7]](s32) ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32) - ; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32) + ; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY13]](s32) ; CHECK: $sgpr8 = COPY [[INT8]](s32) ; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32) - ; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32) + ; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY14]](s32) ; CHECK: $sgpr9 = COPY [[INT9]](s32) ; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32) - ; CHECK: 
[[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32) + ; CHECK: [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY15]](s32) ; CHECK: $sgpr10 = COPY [[INT10]](s32) ; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32) - ; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32) + ; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY16]](s32) ; CHECK: $sgpr11 = COPY [[INT11]](s32) ; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32) - ; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32) + ; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY17]](s32) ; CHECK: $sgpr12 = COPY [[INT12]](s32) ; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32) - ; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32) + ; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY18]](s32) ; CHECK: $sgpr13 = COPY [[INT13]](s32) ; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32) - ; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32) + ; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY19]](s32) ; CHECK: $sgpr14 = COPY [[INT14]](s32) ; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32) - ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32) + ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY20]](s32) ; CHECK: $sgpr15 = COPY [[INT15]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir @@ -14,11 +14,11 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[COPY1]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[COPY1]](s32), [[COPY3]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $sgpr2 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... 
--- @@ -32,11 +32,11 @@ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $vgpr0 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... --- @@ -51,11 +51,11 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s32) = COPY $vgpr1 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... --- @@ -71,11 +71,11 @@ ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... --- @@ -90,9 +90,9 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $vgpr0 %2:_(s32) = COPY $vgpr1 - %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2 + %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,11 +1,20 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i32 @llvm.amdgcn.readfirstlane2.i32(i32) #0 +declare float @llvm.amdgcn.readfirstlane2.f32(float) #0 +declare <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half>) #0 +declare <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16>) #0 +declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)*) #0 +declare i16 @llvm.amdgcn.readfirstlane2.i16(i16) #0 +declare half @llvm.amdgcn.readfirstlane2.f16(half) #0 +declare <3 x i16> @llvm.amdgcn.readfirstlane2.v3i16(<3 x i16>) #0 +declare <9 x float> @llvm.amdgcn.readfirstlane2.v9f32(<9 x float>) #0 -; CHECK-LABEL: {{^}}test_readfirstlane: +; CHECK-LABEL: {{^}}test_readfirstlane_i32: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 -define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) +define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 { + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -15,7 +24,7 @@ ; CHECK-NOT: [[SGPR_VAL]] ; CHECK: ; use [[SGPR_VAL]] define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 32) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } @@ -25,7 +34,7 @@ ; CHECK-NOT: [[VVAL]] ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 32) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -36,7 +45,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %m0) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -51,7 +60,7 @@ ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 { %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %sgpr) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } @@ -62,10 +71,91 @@ define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 { %alloca = alloca i32, addrspace(5) %int = ptrtoint i32 addrspace(5)* %alloca to i32 - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %int) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } +; CHECK-LABEL: 
{{^}}test_readfirstlane_f32: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 { + %readfirstlane = call float @llvm.amdgcn.readfirstlane2.f32(float %src) + store float %readfirstlane, float addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v2f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 { + %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half> %src) + store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v2i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 { + %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16> %src) + store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_p3: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 { + %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)* %src) + store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_i16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_i16(i16 addrspace(1)* %out, i16 %src) { + %readfirstlane = call i16 @llvm.amdgcn.readfirstlane2.i16(i16 %src) + store i16 %readfirstlane, i16 addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_f16: +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_f16(half addrspace(1)* %out, half %src) { + %readfirstlane = call half @llvm.amdgcn.readfirstlane2.f16(half %src) + store half %readfirstlane, half addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v3i16: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) { + %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane2.v3i16(<3 x i16> %src) + store <3 x i16> %readfirstlane, <3 x i16> addrspace(1)* %out, align 2 + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_v9f32: +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v2 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9 +; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10 +; CHECK-NOT: v_readfirstlane_b32 +define void @test_readfirstlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) { + %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane2.v9f32(<9 x float> %src) + store <9 x float> %readfirstlane, <9 x float> addrspace(1)* %out, align 2 + ret void +} + attributes #0 = { nounwind readnone convergent } attributes #1 = { 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s

 declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readlane2.p3i8(i8 addrspace(3)*, i32) #0
+declare i16 @llvm.amdgcn.readlane2.i16(i16, i32) #0
+declare half @llvm.amdgcn.readlane2.f16(half, i32) #0
+declare <3 x i16> @llvm.amdgcn.readlane2.v3i16(<3 x i16>, i32) #0
+declare <9 x float> @llvm.amdgcn.readlane2.v9f32(<9 x float>, i32) #0

 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
 ; CHECK-NOT: v_readlane_b32
@@ -77,6 +82,60 @@
   ret void
 }

+; CHECK-LABEL: {{^}}test_readlane_p3:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readlane = call i8 addrspace(3)* @llvm.amdgcn.readlane2.p3i8(i8 addrspace(3)* %src, i32 15)
+  store i8 addrspace(3)* %readlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_i16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readlane = call i16 @llvm.amdgcn.readlane2.i16(i16 %src, i32 15)
+  store i16 %readlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_f16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_f16(half addrspace(1)* %out, half %src) {
+  %readlane = call half @llvm.amdgcn.readlane2.f16(half %src, i32 15)
+  store half %readlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v3i16:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readlane = call <3 x i16> @llvm.amdgcn.readlane2.v3i16(<3 x i16> %src, i32 15)
+  store <3 x i16> %readlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v9f32:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readlane = call <9 x float> @llvm.amdgcn.readlane2.v9f32(<9 x float> %src, i32 15)
+  store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2

 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,6 +3,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s

 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.writelane2.p3i8(i8 addrspace(3)*, i32, i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.writelane2.i16(i16, i32, i16) #0
+declare half @llvm.amdgcn.writelane2.f16(half, i32, half) #0
+declare <3 x i16> @llvm.amdgcn.writelane2.v3i16(<3 x i16>, i32, <3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.writelane2.v9f32(<9 x float>, i32, <9 x float>) #0

 ; CHECK-LABEL: {{^}}test_writelane_sreg:
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
@@ -79,6 +84,60 @@
   ret void
 }

+; CHECK-LABEL: {{^}}test_writelane_p3:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %writelane = call i8 addrspace(3)* @llvm.amdgcn.writelane2.p3i8(i8 addrspace(3)* null, i32 15, i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %writelane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_i16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %writelane = call i16 @llvm.amdgcn.writelane2.i16(i16 1234, i32 15, i16 %src)
+  store i16 %writelane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_f16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_f16(half addrspace(1)* %out, half %src) {
+  %writelane = call half @llvm.amdgcn.writelane2.f16(half 1.0, i32 15, half %src)
+  store half %writelane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v3i16:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %writelane = call <3 x i16> @llvm.amdgcn.writelane2.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src)
+  store <3 x i16> %writelane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v9f32:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %writelane = call <9 x float> @llvm.amdgcn.writelane2.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src)
+  store <9 x float> %writelane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2

 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,8 +2507,8 @@
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2530,8 +2530,8 @@
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0)
@@ -2541,8 +2541,8 @@
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2552,11 +2552,11 @@
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2570,11 +2570,11 @@
 define i32 @readfirstlane_readlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
@@ -2585,6 +2585,41 @@
   ret i32 %read1
 }

+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  %cast.read = bitcast i32 %read to float
+  ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT: store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  store i32 %bitcast.arg, i32* undef
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------
@@ -2593,8 +2628,8 @@
 define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
-; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT: store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2616,8 +2651,8 @@
 define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane)
@@ -2626,9 +2661,9 @@
 define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
 ; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP1]], i32 [[LANE1:%.*]])
+; CHECK-NEXT: ret i32 [[TMP2]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -2637,8 +2672,8 @@
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT: ret i32 [[READ0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2648,11 +2683,11 @@
 define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP0]], i32 [[LANE]])
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2667,11 +2702,11 @@
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
-; CHECK-NEXT: ret i32 [[READ1]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)