Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,6 +60,8 @@
                               MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
+  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
+                       SDValue GLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -190,7 +192,7 @@
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
   // pointed to by Offsets.
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets) const;
+                        SDValue *Offsets, unsigned Align = 4) const;
 
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4796,6 +4796,70 @@
   return SDValue(NewNode, 0);
 }
 
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+                                       SDValue Offset, SDValue GLC,
+                                       SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), VT.getStoreSize());
+
+  if (!Offset->isDivergent()) {
+    SDValue Ops[] = {
+        Rsrc,
+        Offset, // Offset
+        GLC     // glc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   DAG.getVTList(VT), Ops, VT, MMO);
+  }
+
+  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
+  SmallVector<SDValue, 4> Loads;
+  unsigned NumLoads = 1;
+  MVT LoadVT = VT.getSimpleVT();
+
+  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
+         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
+    NumLoads = VT == MVT::v16i32 ? 4 : 2;
+    LoadVT = MVT::v4i32;
+  }
+
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+  SDValue Ops[] = {
+      DAG.getEntryNode(),                         // Chain
+      Rsrc,                                       // rsrc
+      DAG.getConstant(0, DL, MVT::i32),           // vindex
+      {},                                         // voffset
+      {},                                         // soffset
+      {},                                         // offset
+      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1),            // idxen
+  };
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                            Ops, LoadVT, MMO));
+  }
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+  return Loads[0];
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -4951,38 +5015,15 @@
                            SDLoc(DAG.getEntryNode()),
                            MFI->getArgInfo().WorkItemIDZ);
   case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Ops[] = {
-      Op.getOperand(1),                     // Ptr
-      Op.getOperand(2),                     // Offset
-      DAG.getTargetConstant(0, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), 4);
-    SDVTList VTList = DAG.getVTList(MVT::i32);
-    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                           VTList, Ops, MVT::i32, MMO);
-
+    SDValue Load =
+        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
     return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-    SDValue Ops[] = {
-      Op.getOperand(1),                             // Ptr
-      Op.getOperand(2),                             // Offset
-      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), VT.getStoreSize());
-    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -6017,13 +6058,13 @@
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                        SelectionDAG &DAG,
-                                        SDValue *Offsets) const {
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        unsigned Align) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6035,8 +6076,8 @@
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0
-        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
+    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+                                                Subtarget, Align)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -103,8 +103,6 @@
                                 MachineInstr &Inst) const;
   void splitScalar64BitBFE(SetVectorType &Worklist,
                            MachineInstr &Inst) const;
-  void splitScalarBuffer(SetVectorType &Worklist,
-                         MachineInstr &Inst) const;
   void movePackToVALU(SetVectorType &Worklist,
                       MachineRegisterInfo &MRI,
                       MachineInstr &Inst) const;
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3576,8 +3576,13 @@
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-    SBase->setReg(SGPR);
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+    SOff->setReg(SGPR);
   }
 }
 
@@ -4206,115 +4211,6 @@
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
-
-    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-    case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
-      unsigned VDst;
-      unsigned NewOpcode;
-
-      switch(Opcode) {
-      case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
-        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
-        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
-        break;
-      case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-      case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
-        splitScalarBuffer(Worklist, Inst);
-        Inst.eraseFromParent();
-        continue;
-      }
-
-      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
-      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
-      unsigned Offset = 0;
-
-      // FIXME: This isn't safe because the addressing mode doesn't work
-      // correctly if vaddr is negative.
-      //
-      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
-      //
-      // See if we can extract an immediate offset by recognizing one of these:
-      //   V_ADD_I32_e32 dst, imm, src1
-      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
-      // V_ADD will be removed by "Remove dead machine instructions".
-      if (Add &&
-          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
-           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
-        static const unsigned SrcNames[2] = {
-          AMDGPU::OpName::src0,
-          AMDGPU::OpName::src1,
-        };
-
-        // Find a literal offset in one of source operands.
-        for (int i = 0; i < 2; i++) {
-          const MachineOperand *Src =
-              getNamedOperand(*Add, SrcNames[i]);
-
-          if (Src->isReg()) {
-            MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
-            if (Def) {
-              if (Def->isMoveImmediate())
-                Src = &Def->getOperand(1);
-              else if (Def->isCopy()) {
-                auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
-                if (Mov && Mov->isMoveImmediate()) {
-                  Src = &Mov->getOperand(1);
-                }
-              }
-            }
-          }
-
-          if (Src) {
-            if (Src->isImm())
-              Offset = Src->getImm();
-            else if (Src->isCImm())
-              Offset = Src->getCImm()->getZExtValue();
-          }
-
-          if (Offset && isLegalMUBUFImmOffset(Offset)) {
-            VAddr = getNamedOperand(*Add, SrcNames[!i]);
-            break;
-          }
-
-          Offset = 0;
-        }
-      }
-
-      MachineInstr *NewInstr =
-          BuildMI(*MBB, Inst, Inst.getDebugLoc(),
-                  get(NewOpcode), VDst)
-          .add(*VAddr)                                        // vaddr
-          .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
-          .addImm(0)                                          // soffset
-          .addImm(Offset)                                     // offset
-          .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
-          .addImm(0) // slc
-          .addImm(0) // tfe
-          .cloneMemRefs(Inst)
-          .getInstr();
-
-      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
-                         VDst);
-      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
-      Inst.eraseFromParent();
-
-      // Legalize all operands other than the offset. Notably, convert the srsrc
-      // into SGPRs using v_readfirstlane if needed.
-      legalizeOperands(*NewInstr, MDT);
-      continue;
-    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4796,73 +4692,6 @@
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
-                                    MachineInstr &Inst) const {
-  MachineBasicBlock &MBB = *Inst.getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  MachineBasicBlock::iterator MII = Inst;
-  auto &DL = Inst.getDebugLoc();
-
-  MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
-  MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
-  MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
-  MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
-
-  unsigned Opcode = Inst.getOpcode();
-  unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
-  unsigned Count = 0;
-  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
-  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
-
-  switch(Opcode) {
-  default:
-    return;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
-    Count = 2;
-    break;
-  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
-    Count = 4;
-    break;
-  }
-
-  // FIXME: Should also attempt to build VAddr and Offset like the non-split
-  // case (see call site for this function)
-
-  // Create a vector of result registers
-  SmallVector<unsigned, 16> ResultRegs;
-  for (unsigned i = 0; i < Count ; ++i) {
-    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
-    MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
-      .addReg(Offset.getReg())  // offset
-      .addReg(Rsrc.getReg())    // rsrc
-      .addImm(0)                // soffset
-      .addImm(i << 4)           // inst_offset
-      .addImm(Glc.getImm())     // glc
-      .addImm(0)                // slc
-      .addImm(0)                // tfe
-      .addMemOperand(*Inst.memoperands_begin());
-    // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
-    auto &NewDestOp = NewMI.getOperand(0);
-    for (unsigned i = 0 ; i < 4 ; i++)
-      ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
-                                              RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
-  }
-  // Create a new combined result to replace original with
-  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
-  MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
-                                  get(TargetOpcode::REG_SEQUENCE), FullDestReg);
-
-  for (unsigned i = 0 ; i < Count * 4 ; ++i) {
-    CombinedResBuilder
-      .addReg(ResultRegs[i])
-      .addImm(RI.getSubRegFromChannel(i));
-  }
-
-  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   unsigned DstReg, MachineRegisterInfo &MRI,
Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -440,11 +440,8 @@
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
-// Given Imm, split it into the values to put into the SOffset and ImmOffset
-// fields in an MUBUF instruction. Return false if it is not possible (due to a
-// hardware bug needing a workaround).
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget);
+                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -888,9 +888,12 @@
 // Given Imm, split it into the values to put into the SOffset and ImmOffset
 // fields in an MUBUF instruction. Return false if it is not possible (due to a
 // hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget) {
-  const uint32_t Align = 4;
+                      const GCNSubtarget *Subtarget, uint32_t Align) {
   const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Overflow = 0;
 
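The new Align parameter only changes how much of a constant combined offset splitMUBUFOffset may leave in the 12-bit MUBUF immediate. Below is a rough standalone C++ sketch of that arithmetic, written under the assumption that it matches the in-tree logic except for the omitted GCNSubtarget check (SI/CI reject a non-zero soffset because of the buffer address-clamping hardware bug); the helper name splitMubufOffsetSketch is illustrative only, not the real API.

#include <cstdint>
#include <cstdio>

// Split a constant byte offset into an soffset value and a 12-bit immediate.
// When the input offset is a multiple of Align, both halves stay aligned and
// the immediate keeps room for further per-load offsets smaller than Align.
static bool splitMubufOffsetSketch(uint32_t Imm, uint32_t &SOffset,
                                   uint32_t &ImmOffset, uint32_t Align = 4) {
  const uint32_t MaxImm = (4095u / Align) * Align; // alignDown(4095, Align)
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Small overflow: keep the immediate maximal; 4..64 fits an inline
      // constant in soffset.
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Large overflow: keep the low, wrapped part in the immediate and move
      // the rest into soffset (loadable with s_movk_i32).
      uint32_t High = (Imm + Align) & ~4095u;
      uint32_t Low = (Imm + Align) & 4095u;
      Imm = Low;
      Overflow = High - Align;
    }
  }
  SOffset = Overflow;
  ImmOffset = Imm;
  return true;
}

int main() {
  uint32_t SOffset = 0, ImmOffset = 0;
  // 4096 does not fit the 12-bit immediate; with the default 4-byte alignment
  // it splits into soffset 4 and immediate 4092, matching the VIGFX9 check in
  // smrd_vgpr_offset_imm_too_large below.
  splitMubufOffsetSketch(4096, SOffset, ImmOffset);
  std::printf("soffset %u, imm %u\n", SOffset, ImmOffset);
  return 0;
}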
Index: llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir
@@ -1,6 +1,8 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
 
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
 ---
 name: smrd_vgpr_offset_imm
 body: |
@@ -22,7 +24,9 @@
     SI_RETURN_TO_EPILOG $vgpr0
 ...
 
-# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
+# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
+# GCN: V_READFIRSTLANE_B32
+# GCN: S_BUFFER_LOAD_DWORD_SGPR
 ---
 name: smrd_vgpr_offset_imm_add_u32
 body: |
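The VIGFX9 offsets in the updated smrd.ll checks below follow directly from that arithmetic. As a worked example for smrd_load_nonconst5 (plain C++, repeating the sketch above inline for the constant 4100 with the 64-byte alignment a 16-dword load requires):

#include <cstdio>

int main() {
  const unsigned Imm = 4100;                       // %off.2 = add i32 %off, 4100
  const unsigned Align = 64;                       // 16 bytes * 4 partial loads
  const unsigned MaxImm = (4095u / Align) * Align; // 4032
  unsigned SOffset = 0, ImmOffset = Imm;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      SOffset = Imm - MaxImm;
      ImmOffset = MaxImm;
    } else {
      SOffset = ((Imm + Align) & ~4095u) - Align;  // 4032 == 0xfc0 (s_movk_i32)
      ImmOffset = (Imm + Align) & 4095u;           // 68
    }
  }
  for (unsigned I = 0; I < 4; ++I)                 // prints offsets 68, 84, 100, 116
    std::printf("dwordx4 #%u: soffset %u, offset %u\n", I, SOffset,
                ImmOffset + 16 * I);
  return 0;
}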
Index: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
@@ -292,18 +292,19 @@
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
-  %off = add i32 %offset, 4095
+  %off = add i32 %offset, 4092
   %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
   ret float %r
 }
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
   %off = add i32 %offset, 4096
@@ -495,6 +496,59 @@
   ret void
 }
 
+; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
+; GCN-LABEL: {{^}}smrd_load_nonconst3:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst4:
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %off.2 = add i32 %off, 4088
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst5:
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9: s_movk_i32 s4, 0xfc0
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %off.2 = add i32 %off, 4100
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
 
 ; SMRD load dwordx2
 ; GCN-LABEL: {{^}}smrd_load_dwordx2:
 ; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
@@ -513,9 +567,10 @@
 
 ; GCN-LABEL: {{^}}smrd_uniform_loop:
 ;
-; TODO: this should use an s_buffer_load
+; TODO: we should keep the loop counter in an SGPR
 ;
-; GCN: buffer_load_dword
+; GCN: v_readfirstlane_b32
+; GCN: s_buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 main_body:
   br label %loop