Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -35,6 +35,7 @@ class AMDGPURegisterBankInfo; class GCNSubtarget; class MachineInstr; +class MachineIRBuilder; class MachineOperand; class MachineRegisterInfo; class SIInstrInfo; @@ -82,6 +83,12 @@ bool selectG_IMPLICIT_DEF(MachineInstr &I) const; bool selectG_INSERT(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + std::tuple<Register, unsigned, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + + bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const; + + bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -21,6 +21,7 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -732,6 +733,188 @@ .addImm(Enabled); } +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) + return true; + + // FIXME: matcher should ignore copies + return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; +} + +static unsigned extractGLC(unsigned CachePolicy) { + return CachePolicy & 1; +} + +static unsigned extractSLC(unsigned CachePolicy) { + return (CachePolicy >> 1) & 1; +} + +static unsigned extractDLC(unsigned CachePolicy) { + return (CachePolicy >> 2) & 1; +} + +// Returns Base register, constant offset, and offset def point. +static std::tuple<Register, unsigned, MachineInstr *> +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (!Def) + return {Reg, 0, nullptr}; + + if (Def->getOpcode() == AMDGPU::G_CONSTANT) { + unsigned Offset; + const MachineOperand &Op = Def->getOperand(1); + if (Op.isImm()) + Offset = Op.getImm(); + else + Offset = Op.getCImm()->getZExtValue(); + + return {Register(), Offset, Def}; + } + + int64_t Offset; + if (Def->getOpcode() == AMDGPU::G_ADD) { + // TODO: Handle G_OR used for add case + if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset))) + return {Def->getOperand(1).getReg(), Offset, Def}; + + // FIXME: matcher should ignore copies + if (mi_match(Def->getOperand(2).getReg(), MRI, m_Copy(m_ICst(Offset)))) + return {Def->getOperand(1).getReg(), Offset, Def}; + } + + return {Reg, 0, Def}; +} + +// TODO: Move this to combiner +// Returns base register, imm offset, total constant offset.
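+// Worked example (illustrative, not part of the original patch): for an +// offset of base + 4100, TotalConstOffset is 4100 and the split returns +// ImmOffset = 4, with 4096 (= 4100 & ~4095) folded back into the base +// register so the vgpr part stays 4096-aligned for CSE.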
+std::tuple<Register, unsigned, unsigned> +AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned TotalConstOffset; + MachineInstr *OffsetDef; + MachineRegisterInfo &MRI = *B.getMRI(); + + std::tie(BaseReg, TotalConstOffset, OffsetDef) + = getBaseWithConstantOffset(MRI, OrigOffset); + + unsigned ImmOffset = TotalConstOffset; + + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + if (Overflow != 0) { + // In case this is in a waterfall loop, insert offset code at the def point + // of the offset, not inside the loop. + MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); + MachineBasicBlock &OldMBB = B.getMBB(); + B.setInstr(*OffsetDef); + + if (!BaseReg) { + BaseReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(BaseReg) + .addImm(Overflow); + } else { + Register OverflowVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(OverflowVal) + .addImm(Overflow); + + Register NewBaseReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) + .addReg(BaseReg) + .addReg(OverflowVal, RegState::Kill) + .addImm(0); + BaseReg = NewBaseReg; + } + + B.setInsertPt(OldMBB, OldInsPt); + } + + return {BaseReg, ImmOffset, TotalConstOffset}; +} + +bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, + bool IsFormat) const { + MachineIRBuilder B(MI); + MachineRegisterInfo &MRI = *B.getMRI(); + MachineFunction &MF = B.getMF(); + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + int Size = Ty.getSizeInBits(); + if (Size % 32 != 0) + return false; + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned CachePolicy = MI.getOperand(5).getImm(); + unsigned ImmOffset; + unsigned TotalOffset; + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); + + const bool Offen = !isZero(VOffset, MRI); + + unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; + switch (8 * MemSize) { + case 8: + Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + break; + case 16: + Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + break; + default: + Opc = Offen ?
AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + break; + } + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(CachePolicy)) + .addImm(extractSLC(CachePolicy)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(CachePolicy)) + .addMemOperand(MMO); + + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I, CodeGenCoverage &CoverageInfo) const { MachineBasicBlock *BB = I.getParent(); @@ -786,6 +969,8 @@ MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } + case Intrinsic::amdgcn_raw_buffer_store: + return selectStoreIntrinsic(I, false); default: return selectImpl(I, CoverageInfo); } Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -67,6 +67,9 @@ bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + + bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat) const; bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const override; Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1502,6 +1502,38 @@ return true; } +bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsFormat) const { + assert(!IsFormat && "format buffer stores not implemented"); + + // TODO: Reject f16 format on targets where unsupported. + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + B.setInstr(MI); + + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + + // Fixup illegal register types for i8 stores. 
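+ // (Note added for exposition, not in the original patch: widening s8/s16 + // with G_ANYEXT is safe because selection later picks BUFFER_STORE_BYTE or + // BUFFER_STORE_SHORT from the memory size, which only writes the low bits.)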
+ if (Ty == LLT::scalar(8) || Ty == S16) { + Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); + MI.getOperand(1).setReg(AnyExt); + return true; + } + + if (Ty.isVector()) { + if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) + return true; + + return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; + } + + return Ty == S32; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1582,6 +1614,8 @@ case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::amdgcn_raw_buffer_store: + return legalizeRawBufferStore(MI, MRI, B, false); default: return true; } Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -23,7 +23,9 @@ namespace llvm { class LLT; +class GCNSubtarget; class MachineIRBuilder; +class SIInstrInfo; class SIRegisterInfo; class TargetRegisterInfo; @@ -36,9 +38,15 @@ #include "AMDGPUGenRegisterBank.inc" }; class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const GCNSubtarget &Subtarget; const SIRegisterInfo *TRI; + const SIInstrInfo *TII; - void executeInWaterfallLoop(MachineInstr &MI, + bool executeInWaterfallLoop(MachineIRBuilder &B, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + bool executeInWaterfallLoop(MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const; @@ -52,6 +60,15 @@ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, MachineRegisterInfo &MRI, int RSrcIdx) const; + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Reg) const; + + std::pair<Register, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register Offset) const; + + MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const; + /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; @@ -110,7 +127,7 @@ int RsrcIdx) const; public: - AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + AMDGPURegisterBankInfo(const GCNSubtarget &STI); unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -33,6 +34,7 @@ #include "AMDGPUGenRegisterBankInfo.def" using namespace llvm; +using namespace MIPatternMatch; namespace { @@ -84,9 +86,11 @@ }; } -AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterBankInfo(), - TRI(static_cast<const SIRegisterInfo *>(&TRI)) { + Subtarget(ST), + TRI(Subtarget.getRegisterInfo()), + TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd.
static bool AlreadyInit = false; @@ -631,8 +635,10 @@ /// /// There is additional complexity to try for compare values to identify the /// unique values used. -void AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineInstr &MI, MachineRegisterInfo &MRI, +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, + MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { MachineFunction *MF = MI.getParent()->getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -655,9 +661,8 @@ // No operands need to be replaced, so no need to loop. if (SGPROperandRegs.empty()) - return; + return false; - MachineIRBuilder B(MI); SmallVector<Register, 4> ResultRegs; SmallVector<Register, 4> InitResultRegs; SmallVector<Register, 4> PhiRegs; @@ -915,6 +920,18 @@ B.buildInstr(AMDGPU::S_MOV_B64_term) .addDef(AMDGPU::EXEC) .addReg(SaveExecReg); + + // Restore the insert point before the original instruction. + B.setInsertPt(MBB, MBB.end()); + + return true; +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + MachineIRBuilder B(MI); + return executeInWaterfallLoop(B, MI, MRI, OpIndices); } // Legalize an operand that must be an SGPR by inserting a readfirstlane. @@ -1062,6 +1079,184 @@ } } +/// Handle register layout difference for f16 images for some subtargets. +Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!Subtarget.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + LLT StoreVT = MRI.getType(Reg); + if (!StoreVT.isVector() || StoreVT.getElementType() != S16) + return Reg; + + auto Unmerge = B.buildUnmerge(S16, Reg); + + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(Unmerge.getReg(I)); + + const LLT S32 = LLT::scalar(32); + int NumElts = StoreVT.getNumElements(); + + return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +static std::pair<Register, unsigned> +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + int64_t Const; + if (mi_match(Reg, MRI, m_ICst(Const))) + return std::make_pair(Register(), Const); + + Register Base; + if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) + return std::make_pair(Base, Const); + + // TODO: Handle G_OR used for add case + return std::make_pair(Reg, 0); +} + +std::pair<Register, unsigned> +AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned ImmOffset; + const LLT S32 = LLT::scalar(32); + + std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), + OrigOffset); + + unsigned C1 = 0; + if (ImmOffset != 0) { + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive.
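+ // Worked example (illustrative, not from the original patch): an ImmOffset + // of 8208 gives Overflow = 8192 and a final ImmOffset of 16, while a + // negative constant such as -16 makes (int32_t)Overflow negative, so the + // whole value is folded back into the register add and ImmOffset stays 0.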
+ unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + C1 = ImmOffset; + if (Overflow != 0) { + if (!BaseReg) + BaseReg = B.buildConstant(S32, Overflow).getReg(0); + else { + auto OverflowVal = B.buildConstant(S32, Overflow); + BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); + } + } + } + + if (!BaseReg) + BaseReg = B.buildConstant(S32, 0).getReg(0); + + return {BaseReg, C1}; +} + +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + return mi_match(Reg, MRI, m_ICst(C)) && C == 0; +} + +static unsigned extractGLC(unsigned CachePolicy) { + return CachePolicy & 1; +} + +static unsigned extractSLC(unsigned CachePolicy) { + return (CachePolicy >> 1) & 1; +} + +static unsigned extractDLC(unsigned CachePolicy) { + return (CachePolicy >> 2) & 1; +} + +MachineInstr * +AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const { + MachineRegisterInfo &MRI = *B.getMRI(); + executeInWaterfallLoop(B, MI, MRI, {2, 4}); + + // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. + + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + int EltSize = Ty.getScalarSizeInBits(); + int Size = Ty.getSizeInBits(); + + // FIXME: Broken integer truncstore. + if (EltSize != 32) + report_fatal_error("unhandled intrinsic store"); + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + const int MemSize = (*MI.memoperands_begin())->getSize(); + + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned CachePolicy = MI.getOperand(5).getImm(); + + unsigned ImmOffset; + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + + const bool Offen = !isZero(VOffset, MRI); + + unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; + switch (8 * MemSize) { + case 8: + Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + break; + case 16: + Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + break; + default: + Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + break; + } + + + // Set the insertion point back to the instruction in case it was moved into a + // loop. + B.setInstr(MI); + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(CachePolicy)) + .addImm(extractSLC(CachePolicy)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(CachePolicy)) + .cloneMemRefs(MI); + + // FIXME: We need a way to report failure from applyMappingImpl. + // Insert constrain copies before inserting the loop. 
+ if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) + report_fatal_error("failed to constrain selected store intrinsic"); + + return MIB; +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -1397,8 +1592,23 @@ return; } case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_buffer_store: { + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: { applyDefaultMapping(OpdMapper); + +#if 0 + // We directly select here to keep pack/unpack and conversion + // instructions out of a possible waterfall loop, and to avoid unnecessary + // intermediate operations. + + if (IntrID == Intrinsic::amdgcn_raw_buffer_store) { + MachineIRBuilder B(MI); + selectStoreIntrinsic(B, MI); + MI.eraseFromParent(); + return; + } +#endif executeInWaterfallLoop(MI, MRI, {2, 4}); return; } @@ -2338,7 +2548,8 @@ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } - case Intrinsic::amdgcn_raw_buffer_store: { + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -280,7 +280,7 @@ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); - RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + RegBankInfo.reset(new AMDGPURegisterBankInfo(*this)); InstSelector.reset(new AMDGPUInstructionSelector( *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } Index: test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -0,0 +1,791 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +; FIXME: Test with SI when argument lowering not broken for f16 + +; Natural mapping +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]],
[[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Copies for VGPR arguments +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float inreg %val, i32 inreg %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr6 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr7 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Waterfall for rsrc +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Waterfall for soffset +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, 
i32 0) + ret void +} + +; Waterfall for rsrc and soffset +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: 
S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_dlc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_slc_dlc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_dlc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: 
[[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc_dlc(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_glc_slc_dlc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: 
liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i8(<4 x i32> inreg %rsrc, i32 %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i8 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + 
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; CHECK: S_ENDPGM 0 + %val.trunc = trunc i32 %val to i8 + call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i16(<4 x i32> inreg %rsrc, i32 %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %val.trunc = trunc i32 %val to i16 + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16(<4 x i32> inreg %rsrc, half %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: 
[[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(<4 x i32> inreg %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16(<4 x i32> %rsrc, <4 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: 
%bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095(<4 x i32> inreg %rsrc, float %val, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096(<4 x i32> inreg %rsrc, float %val, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096 + ; CHECK: 
bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 16 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 + ; CHECK: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 4095 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 4096 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; 
CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 16 + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]],
%subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 4095 + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096(<4 x i32> inreg %rsrc, <2 x half> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 4096 + call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + ret void +} + +; The variable offset requires an add, and the VGPR rsrc requires a waterfall loop. Make sure the add is done outside of the waterfall loop.
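A note on the split that the constant-offset tests encode: the MUBUF immediate offset field is 12 bits (0..4095), so any larger constant is rounded down to a multiple of 4096 that is materialized in a VGPR, with the remainder left in the immediate. A minimal standalone sketch of that arithmetic, using a hypothetical helper name (expectedOffsetSplit) that is not part of this patch and ignoring the negative-overflow special case:

// Hypothetical helper illustrating the immediate/VGPR split that the
// CHECK lines in this test encode; not code from the patch.
#include <cassert>
#include <cstdint>
#include <utility>

// Returns {VGPR part, immediate part} for a constant buffer offset.
static std::pair<uint32_t, uint32_t> expectedOffsetSplit(uint32_t Offset) {
  const uint32_t MaxImm = 4095;          // 12-bit MUBUF immediate offset field
  uint32_t Overflow = Offset & ~MaxImm;  // multiple of 4096, moved to a VGPR
  return {Overflow, Offset - Overflow};  // remainder stays in the immediate
}

int main() {
  assert(expectedOffsetSplit(4095).first == 0 &&
         expectedOffsetSplit(4095).second == 4095); // $noreg vaddr, imm 4095
  assert(expectedOffsetSplit(4096).first == 4096 &&
         expectedOffsetSplit(4096).second == 0);    // V_MOV_B32 4096, imm 0
  assert(expectedOffsetSplit(5000).first == 4096 &&
         expectedOffsetSplit(5000).second == 904);  // V_MOV_B32 4096, imm 904
  return 0;
}

The two tests that follow combine this with a VGPR resource, which additionally requires the waterfall loop.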
+define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000(<4 x i32> %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 5000 + ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK: %11:vgpr_32, dead %29:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + %voffset.add = add i32 %voffset, 5000 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 
0) + ret void +} + +; The constant 5000 offset overflows the 12-bit immediate field, so part of it must be materialized in a VGPR. Make sure this is done outside of the waterfall loop. +define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset(<4 x i32> %rsrc, float %val, i32 inreg %soffset) { + ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY6]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: S_ENDPGM 0 + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32 immarg) +declare void
@llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32 immarg) + +declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32 immarg) + +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32 immarg) +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) Index: test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.store.ll +++ /dev/null @@ -1,168 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s - -; Natural mapping -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) - ret void -} - -; Copies for VGPR arguments -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float inreg %val, i32 inreg %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: 
[[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.store), [[COPY7]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) - ret void -} - -; Waterfall for rsrc -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.1, %13, %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; 
CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) - ret void -} - -; Waterfall for soffset -define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 %soffset) { - ; CHECK-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.1, %13, %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) - ret void -} - -; Waterfall for rsrc and soffset -define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<4 x i32> %rsrc, float %val, i32 %voffset, i32 %soffset) { - ; CHECK-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr6 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), 
[[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.1, %13, %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) - ret void -} - -declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
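The instruction-select checks above and the deleted regbankselect checks share the same waterfall-loop shape for a VGPR resource descriptor. A conceptual host-side model of that loop, as illustrative C++ only (the lane count, types, and names here are assumptions, not LLVM code):

#include <cstdint>

// One 128-bit buffer descriptor per lane, as four 32-bit words.
struct Rsrc { uint32_t Words[4]; };

static bool sameRsrc(const Rsrc &A, const Rsrc &B) {
  for (int I = 0; I != 4; ++I)
    if (A.Words[I] != B.Words[I])
      return false;
  return true;
}

// Model of the exec-mask loop in the CHECK lines: read the first active
// lane's descriptor (V_READFIRSTLANE), compare every active lane against it
// (V_CMP_EQ_U64 + S_AND_B64), issue the store for just the matching lanes
// (S_AND_SAVEEXEC_B64), drop them from exec (S_XOR_B64_term), and branch
// back while any lane remains (S_CBRANCH_EXECNZ).
template <typename StoreFn>
void waterfall(uint64_t Exec, const Rsrc (&Lanes)[64], StoreFn Store) {
  while (Exec) {
    int First = __builtin_ctzll(Exec);   // index of the first active lane
    const Rsrc &Uniform = Lanes[First];
    uint64_t Match = 0;
    for (int L = 0; L != 64; ++L)
      if (((Exec >> L) & 1) && sameRsrc(Lanes[L], Uniform))
        Match |= uint64_t(1) << L;
    Store(Uniform, Match);               // store under the narrowed exec mask
    Exec &= ~Match;
  }
}

Every iteration retires at least the first active lane, so the loop terminates; when the descriptor is in fact uniform across the wave it executes exactly once, which is why the SGPR-resource tests select straight-line code.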