diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -218,6 +218,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -514,6 +514,7 @@ BUFFER_LOAD_BYTE, BUFFER_LOAD_SHORT, BUFFER_LOAD_FORMAT, + BUFFER_LOAD_FORMAT_TFE, BUFFER_LOAD_FORMAT_D16, SBUFFER_LOAD, BUFFER_STORE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4429,6 +4429,7 @@ NODE_NAME_CASE(BUFFER_LOAD_BYTE) NODE_NAME_CASE(BUFFER_LOAD_SHORT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(SBUFFER_LOAD) NODE_NAME_CASE(BUFFER_STORE) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4465,6 +4465,27 @@ return true; } +static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, + Register VIndex, Register VOffset, Register SOffset, + unsigned ImmOffset, unsigned Format, + unsigned AuxiliaryData, MachineMemOperand *MMO, + bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { + auto MIB = B.buildInstr(Opc) + .addDef(LoadDstReg) // vdata + .addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset); // offset(imm) + + if (IsTyped) + MIB.addImm(Format); + + MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? -1 : 0) // idxen(imm) + .addMemOperand(MMO); +} + bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, @@ -4476,18 +4497,27 @@ const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); - Register RSrc = MI.getOperand(2).getReg(); + + Register StatusDst; + int OpOffset = 0; + assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); + bool IsTFE = MI.getNumExplicitDefs() == 2; + if (IsTFE) { + StatusDst = MI.getOperand(1).getReg(); + ++OpOffset; + } + + Register RSrc = MI.getOperand(2 + OpOffset).getReg(); // The typed intrinsics add an immediate after the registers. const unsigned NumVIndexOps = IsTyped ? 8 : 7; // The struct intrinsic variants add one additional operand over raw. - const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; Register VIndex; - int OpOffset = 0; if (HasVIndex) { - VIndex = MI.getOperand(3).getReg(); - OpOffset = 1; + VIndex = MI.getOperand(3 + OpOffset).getReg(); + ++OpOffset; } else { VIndex = B.buildConstant(S32, 0).getReg(0); } @@ -4514,13 +4544,21 @@ unsigned Opc; + // TODO: Support TFE for typed and narrow loads. if (IsTyped) { + assert(!IsTFE); Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; } else if (IsFormat) { - Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : - AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; + if (IsD16) { + assert(!IsTFE); + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; + } else { + Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE + : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; + } } else { + assert(!IsTFE); switch (MemTy.getSizeInBits()) { case 8: Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; @@ -4534,49 +4572,46 @@ } } - Register LoadDstReg; - - bool IsExtLoad = - (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); - LLT UnpackedTy = Ty.changeElementSize(32); - - if (IsExtLoad) - LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); - else if (Unpacked && IsD16 && Ty.isVector()) - LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); - else - LoadDstReg = Dst; - - auto MIB = B.buildInstr(Opc) - .addDef(LoadDstReg) // vdata - .addUse(RSrc) // rsrc - .addUse(VIndex) // vindex - .addUse(VOffset) // voffset - .addUse(SOffset) // soffset - .addImm(ImmOffset); // offset(imm) - - if (IsTyped) - MIB.addImm(Format); - - MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) - .addImm(HasVIndex ? -1 : 0) // idxen(imm) - .addMemOperand(MMO); - - if (LoadDstReg != Dst) { - B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - - // Widen result for extending loads was widened. - if (IsExtLoad) - B.buildTrunc(Dst, LoadDstReg); - else { - // Repack to original 16-bit vector result - // FIXME: G_TRUNC should work, but legalization currently fails - auto Unmerge = B.buildUnmerge(S32, LoadDstReg); - SmallVector Repack; - for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) - Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); - B.buildMerge(Dst, Repack); + if (IsTFE) { + unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); + unsigned NumLoadDWords = NumValueDWords + 1; + LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); + Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); + buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, + Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); + if (NumValueDWords == 1) { + B.buildUnmerge({Dst, StatusDst}, LoadDstReg); + } else { + SmallVector LoadElts; + for (unsigned I = 0; I != NumValueDWords; ++I) + LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); + LoadElts.push_back(StatusDst); + B.buildUnmerge(LoadElts, LoadDstReg); + LoadElts.truncate(NumValueDWords); + B.buildMerge(Dst, LoadElts); } + } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || + (IsD16 && !Ty.isVector())) { + Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); + buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, + Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + B.buildTrunc(Dst, LoadDstReg); + } else if (Unpacked && IsD16 && Ty.isVector()) { + LLT UnpackedTy = Ty.changeElementSize(32); + Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); + buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, + Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + // FIXME: G_TRUNC should work, but legalization currently fails + auto Unmerge = B.buildUnmerge(S32, LoadDstReg); + SmallVector Repack; + for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) + Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); + B.buildMerge(Dst, Repack); + } else { + buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, + AuxiliaryData, MMO, IsTyped, HasVIndex, B); } MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2873,6 +2873,7 @@ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: @@ -4038,6 +4039,7 @@ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1299,6 +1299,15 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; + let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -919,11 +919,11 @@ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } -static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) { - assert(DMaskLanes != 0); +static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) { + assert(MaxNumLanes != 0); if (auto *VT = dyn_cast(Ty)) { - unsigned NumElts = std::min(DMaskLanes, VT->getNumElements()); + unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements()); return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(VT->getElementType()), NumElts); @@ -933,19 +933,15 @@ } // Peek through TFE struct returns to only use the data size. -static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) { +static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { auto *ST = dyn_cast(Ty); if (!ST) - return memVTFromImageData(Ty, DMaskLanes); + return memVTFromLoadIntrData(Ty, MaxNumLanes); - // Some intrinsics return an aggregate type - special case to work out the - // correct memVT. - // - // Only limited forms of aggregate type currently expected. - if (ST->getNumContainedTypes() != 2 || - !ST->getContainedType(1)->isIntegerTy(32)) - return EVT(); - return memVTFromImageData(ST->getContainedType(0), DMaskLanes); + // TFE intrinsics return an aggregate type. + assert(ST->getNumContainedTypes() == 2 && + ST->getContainedType(1)->isIntegerTy(32)); + return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes); } bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -978,7 +974,7 @@ Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { - unsigned DMaskLanes = 4; + unsigned MaxNumLanes = 4; if (RsrcIntr->IsImage) { const AMDGPU::ImageDimIntrinsicInfo *Intr @@ -991,12 +987,11 @@ // IR type. Check the dmask for the real number of elements loaded. unsigned DMask = cast(CI.getArgOperand(0))->getZExtValue(); - DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); + MaxNumLanes = DMask == 0 ? 1 : countPopulation(DMask); } + } - Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes); - } else - Info.memVT = EVT::getEVT(CI.getType()); + Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes); // FIXME: What does alignment mean for an image? Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1008,7 +1003,7 @@ if (RsrcIntr->IsImage) { unsigned DMask = cast(CI.getArgOperand(1))->getZExtValue(); unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); - Info.memVT = memVTFromImageData(DataTy, DMaskLanes); + Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes); } else Info.memVT = EVT::getEVT(DataTy); @@ -4854,8 +4849,18 @@ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); - unsigned Opc = - IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD; + assert(M->getNumValues() == 2 || M->getNumValues() == 3); + bool IsTFE = M->getNumValues() == 3; + + unsigned Opc; + if (IsFormat) { + Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE + : AMDGPUISD::BUFFER_LOAD_FORMAT; + } else { + // TODO: Support non-format TFE loads. + assert(!IsTFE); + Opc = AMDGPUISD::BUFFER_LOAD; + } if (IsD16) { return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); @@ -7850,35 +7855,54 @@ } // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to -// dwordx4 if on SI. +// dwordx4 if on SI and handle TFE loads. SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, ArrayRef Ops, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) const { + LLVMContext &C = *DAG.getContext(); + MachineFunction &MF = DAG.getMachineFunction(); EVT VT = VTList.VTs[0]; - EVT WidenedVT = VT; - EVT WidenedMemVT = MemVT; - if (!Subtarget->hasDwordx3LoadStores() && - (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) { - WidenedVT = EVT::getVectorVT(*DAG.getContext(), - WidenedVT.getVectorElementType(), 4); - WidenedMemVT = EVT::getVectorVT(*DAG.getContext(), - WidenedMemVT.getVectorElementType(), 4); - MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16); - } - assert(VTList.NumVTs == 2); - SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); - - auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, - WidenedMemVT, MMO); - if (WidenedVT != VT) { - auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, - DAG.getVectorIdxConstant(0, DL)); - NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); + assert(VTList.NumVTs == 2 || VTList.NumVTs == 3); + bool IsTFE = VTList.NumVTs == 3; + if (IsTFE) { + unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32); + unsigned NumOpDWords = NumValueDWords + 1; + EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords); + SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]); + MachineMemOperand *OpDWordsMMO = + MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4); + SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops, + OpDWordsVT, OpDWordsMMO, DAG); + SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, + DAG.getVectorIdxConstant(NumValueDWords, DL)); + SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); + SDValue ValueDWords = + NumValueDWords == 1 + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, + EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op, + ZeroIdx); + SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords); + return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL); } - return NewOp; + + if (!Subtarget->hasDwordx3LoadStores() && + (VT == MVT::v3i32 || VT == MVT::v3f32)) { + EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4); + EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4); + MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16); + SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); + SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, + WidenedMemVT, WidenedMMO); + SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op, + DAG.getVectorIdxConstant(0, DL)); + return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL); + } + + return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO); } SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -135,6 +135,8 @@ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_format_tfe : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_TFE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3233,6 +3233,7 @@ def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction; def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction; def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction; def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -214,10 +214,125 @@ ret float %fval } +define amdgpu_cs void @struct_buffer_load_format_v4i32_tfe(<4 x i32> inreg %rsrc, <4 x i32> addrspace(1)* %value, i32 addrspace(1)* %status) { + ; CHECK-LABEL: name: struct_buffer_load_format_v4i32_tfe + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from custom "BufferResource", align 1, addrspace 4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub3 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_TFE_IDXEN]].sub4 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3 + ; CHECK-NEXT: FLAT_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into %ir.value, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + + %v = extractvalue { <4 x i32>, i32 } %load, 0 + store <4 x i32> %v, <4 x i32> addrspace(1)* %value + + %s = extractvalue { <4 x i32>, i32 } %load, 1 + store i32 %s, i32 addrspace(1)* %status + + ret void +} + +define amdgpu_cs void @struct_buffer_load_format_v3i32_tfe(<4 x i32> inreg %rsrc, <3 x i32> addrspace(1)* %value, i32 addrspace(1)* %status) { + ; CHECK-LABEL: name: struct_buffer_load_format_v3i32_tfe + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from custom "BufferResource", align 1, addrspace 4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_TFE_IDXEN]].sub3 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2 + ; CHECK-NEXT: FLAT_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>) into %ir.value, align 16, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + + %v = extractvalue { <3 x i32>, i32 } %load, 0 + store <3 x i32> %v, <3 x i32> addrspace(1)* %value + + %s = extractvalue { <3 x i32>, i32 } %load, 1 + store i32 %s, i32 addrspace(1)* %status + + ret void +} + +define amdgpu_cs void @struct_buffer_load_format_i32_tfe(<4 x i32> inreg %rsrc, i32 addrspace(1)* %value, i32 addrspace(1)* %status) { + ; CHECK-LABEL: name: struct_buffer_load_format_i32_tfe + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_X_TFE_IDXEN [[COPY8]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_X_TFE_IDXEN]].sub1 + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.value, addrspace 1) + ; CHECK-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.status, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + + %v = extractvalue { i32, i32 } %load, 0 + store i32 %v, i32 addrspace(1)* %value + + %s = extractvalue { i32, i32 } %load, 1 + store i32 %s, i32 addrspace(1)* %status + + ret void +} + declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #0 declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -118,9 +118,41 @@ ret <2 x float> %data } +;CHECK-LABEL: {{^}}buffer_load_xyzw_tfe: +;CHECK: buffer_load_format_xyzw v[0:4], {{v[0-9]+}}, s[0:3], 0 idxen tfe +;CHECK: s_waitcnt +define amdgpu_cs <4 x float> @buffer_load_xyzw_tfe(<4 x i32> inreg %rsrc) { + %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = extractvalue { <4 x float>, i32 } %load, 0 + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_xyz_tfe: +;CHECK: buffer_load_format_xyz v[2:5], {{v[0-9]+}}, s[0:3], 0 idxen tfe +;CHECK: s_waitcnt +define amdgpu_cs void @buffer_load_xyz_tfe(<4 x i32> inreg %rsrc, <3 x i32> addrspace(1)* %out) { + %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = extractvalue { <3 x i32>, i32 } %load, 0 + store <3 x i32> %data, <3 x i32> addrspace(1)* %out + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x_tfe: +;CHECK: buffer_load_format_x v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen tfe +;CHECK: s_waitcnt +define amdgpu_cs float @buffer_load_x_tfe(<4 x i32> inreg %rsrc) { + %load = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %data = extractvalue { i32, i32 } %load, 0 + %fdata = bitcast i32 %data to float + ret float %fdata +} + declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0 declare i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32>, i32, i32, i32, i32) #0 +declare { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4f32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v3i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 +declare { i32, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_i32i32s(<4 x i32>, i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind readonly }