Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1261,6 +1261,23 @@
 // Legacy form of the intrinsic. raw and struct forms should be preferred.
 def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
+
+class AMDGPURawBufferLoadLDS : Intrinsic <
+  [],
+  [llvm_v4i32_ty,                      // rsrc(SGPR)
+   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
+   llvm_i32_ty,                        // Data byte size: 1/2/4/8/12/16
+   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
+   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
+   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                                       //                                   bit 1 = slc,
+                                       //                                   bit 2 = dlc on gfx10+))
+                                       //                 swizzled buffer (bit 3 = swz))
+  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
+   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
+
 } // defset AMDGPUBufferIntrinsics
 
 // Uses that do not set the done bit should set IntrWriteMem on the
Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -191,6 +191,7 @@
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_LDS, SIbuffer_load_lds>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
 def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -173,6 +173,8 @@
   bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, bool IsFormat, bool IsTyped) const;
+  bool legalizeBufferLoadLds(MachineInstr &MI, MachineRegisterInfo &MRI,
+                             MachineIRBuilder &B) const;
   bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                             Intrinsic::ID IID) const;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4245,6 +4245,53 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeBufferLoadLds(MachineInstr &MI,
+                                                MachineRegisterInfo &MRI,
+                                                MachineIRBuilder &B) const {
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  const LLT S32 = LLT::scalar(32);
+
+  Register RSrc = MI.getOperand(1).getReg();
+
+  // The struct intrinsic variants add one additional operand over raw.
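+  // Operand layout of the raw form: intrinsic ID, rsrc, LDS pointer, data
+  // byte size, voffset, soffset, imm offset, aux (eight operands in all);
+  // the struct form inserts a vindex operand after the byte size, giving
+  // nine.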
+  const bool HasVIndex = MI.getNumOperands() == 9;
+  Register VIndex;
+  int OpOffset = 0;
+  if (HasVIndex) {
+    VIndex = MI.getOperand(4).getReg();
+    OpOffset = 1;
+  } else {
+    VIndex = B.buildConstant(S32, 0).getReg(0);
+  }
+
+  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
+  unsigned ImmOffset = MI.getOperand(6 + OpOffset).getImm();
+  Register M0Val = MI.getOperand(2).getReg();
+
+  unsigned Size = MI.getOperand(3).getImm();
+  unsigned AuxiliaryData = MI.getOperand(7 + OpOffset).getImm();
+
+  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
+
+  B.buildInstr(AMDGPU::COPY)
+    .addDef(AMDGPU::M0)
+    .addUse(M0Val);
+  B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD_LDS)
+      .addUse(RSrc)               // rsrc
+      .addUse(VIndex)             // vindex
+      .addUse(VOffset)            // voffset
+      .addUse(SOffset)            // soffset
+      .addImm(ImmOffset)          // offset(imm)
+      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
+      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+      .addImm(Size)               // data byte size
+      .addMemOperand(MMO);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                                MachineIRBuilder &B,
                                                bool IsInc) const {
@@ -5322,6 +5369,8 @@
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load:
     return legalizeBufferLoad(MI, MRI, B, false, false);
+  case Intrinsic::amdgcn_raw_buffer_load_lds:
+    return legalizeBufferLoadLds(MI, MRI, B);
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_struct_buffer_load_format:
     return legalizeBufferLoad(MI, MRI, B, true, false);
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2864,6 +2864,11 @@
     executeInWaterfallLoop(MI, MRI, {1, 4});
     return;
   }
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_LDS: {
+    applyDefaultMapping(OpdMapper);
+    executeInWaterfallLoop(MI, MRI, {0, 3});
+    return;
+  }
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
@@ -3962,6 +3967,23 @@
     // initialized.
     break;
   }
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_LDS: {
+    // rsrc
+    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+    // vindex
+    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+    // voffset
+    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+    // soffset
+    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+    // Any remaining operands are immediates and were correctly null
+    // initialized.
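+    // The LDS destination pointer is not an operand of this instruction;
+    // the legalizer has already copied it into m0.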
+    break;
+  }
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
@@ -4436,6 +4458,13 @@
     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
     break;
   }
+  case Intrinsic::amdgcn_raw_buffer_load_lds: {
+    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+    break;
+  }
   case Intrinsic::amdgcn_raw_buffer_store:
   case Intrinsic::amdgcn_raw_buffer_store_format:
   case Intrinsic::amdgcn_raw_tbuffer_store: {
Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1304,6 +1304,45 @@
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
 
+multiclass MUBUF_LoadLDSIntrinsicPat<SDPatternOperator ld, string opcode,
+                                     int size> {
+  def : GCNPat<
+    (ld v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+        timm:$auxiliary, 0, (i32 size)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+      (extract_cpol $auxiliary), (extract_swz $auxiliary))
+  >;
+
+  def : GCNPat<
+    (ld v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+        timm:$auxiliary, 0, (i32 size)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+      (extract_cpol $auxiliary), (extract_swz $auxiliary))
+  >;
+
+  def : GCNPat<
+    (ld v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+        timm:$auxiliary, timm, (i32 size)),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+      (extract_cpol $auxiliary), (extract_swz $auxiliary))
+  >;
+
+  def : GCNPat<
+    (ld v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+        timm:$auxiliary, timm, (i32 size)),
+    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
+      (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+      SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+      (extract_cpol $auxiliary), (extract_swz $auxiliary))
+  >;
+}
+
+defm : MUBUF_LoadLDSIntrinsicPat<SIbuffer_load_lds, "BUFFER_LOAD_UBYTE_LDS", 1>;
+defm : MUBUF_LoadLDSIntrinsicPat<SIbuffer_load_lds, "BUFFER_LOAD_USHORT_LDS", 2>;
+defm : MUBUF_LoadLDSIntrinsicPat<SIbuffer_load_lds, "BUFFER_LOAD_DWORD_LDS", 4>;
+defm : MUBUF_LoadLDSIntrinsicPat<SIbuffer_load_lds, "BUFFER_LOAD_DWORDX2_LDS", 8>;
+defm : MUBUF_LoadLDSIntrinsicPat<SIbuffer_load_lds, "BUFFER_LOAD_DWORDX3_LDS", 12>;
+defm : MUBUF_LoadLDSIntrinsicPat<SIbuffer_load_lds, "BUFFER_LOAD_DWORDX4_LDS", 16>;
+
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode, ValueType memoryVt = vt> {
   defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>);
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1191,6 +1191,16 @@
     // XXX - Should this be volatile without known ordering?
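+    // Buffer-to-LDS DMA intrinsics return void, so the transfer width
+    // cannot be inferred from a result type; the switch below derives
+    // memVT from the byte-size argument instead.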
     Info.flags |= MachineMemOperand::MOVolatile;
+
+    switch (IntrID) {
+    default:
+      break;
+    case Intrinsic::amdgcn_raw_buffer_load_lds: {
+      unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+      Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+      return true;
+    }
+    }
   }
   return true;
 }
@@ -8219,6 +8229,28 @@
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
+  case Intrinsic::amdgcn_raw_buffer_load_lds: {
+    auto *M = cast<MemSDNode>(Op);
+    MachineMemOperand *MMO = M->getMemOperand();
+
+    SDValue Ops[] = {
+      Op.getOperand(0),                                      // Chain
+      Op.getOperand(2),                                      // rsrc
+      DAG.getConstant(0, DL, MVT::i32),                      // vindex
+      Op.getOperand(5),                                      // voffset
+      Op.getOperand(6),                                      // soffset
+      Op.getOperand(7),                                      // imm_offset
+      Op.getOperand(8),                                      // cachepolicy, swizzled buffer
+      DAG.getTargetConstant(0, DL, MVT::i1),                 // idxen
+      Op.getOperand(4),                                      // data byte size
+      copyToM0(DAG, Chain, DL, Op.getOperand(3)).getValue(1) // Glue
+    };
+
+    updateBufferMMO(MMO, Ops[3], Ops[4], Ops[5]);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL,
+                                   M->getVTList(), Ops, M->getMemoryVT(), MMO);
+  }
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -385,6 +385,10 @@
       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
     if (DataOpIdx == -1)
       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+    if (DataOpIdx == -1) { // LDS DMA
+      Width = (*LdSt.memoperands_begin())->getSize();
+      return true;
+    }
     Width = getOpSize(LdSt, DataOpIdx);
     return true;
   }
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -121,6 +121,16 @@
                       SDTCisVT<6, i32>,   // cachepolicy, swizzled buffer(imm)
                       SDTCisVT<7, i1>]>;  // idxen(imm)
 
+def SDTBufferLoadLDS : SDTypeProfile<0, 8,
+    [SDTCisVT<0, v4i32>, // rsrc
+     SDTCisVT<1, i32>,   // vindex(VGPR)
+     SDTCisVT<2, i32>,   // voffset(VGPR)
+     SDTCisVT<3, i32>,   // soffset(SGPR)
+     SDTCisVT<4, i32>,   // offset(imm)
+     SDTCisVT<5, i32>,   // cachepolicy, swizzled buffer(imm)
+     SDTCisVT<6, i1>,    // idxen(imm)
+     SDTCisVT<7, i32>]>; // data byte size
+
 def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad,
@@ -136,6 +146,8 @@
 def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
                                        SDTBufferLoad,
                                        [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_lds : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoadLDS,
+                                [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPInGlue]>;
 
 def SDTBufferStore : SDTypeProfile<0, 8,
     [                    // vdata
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3009,6 +3009,17 @@
   let mayLoad = 1;
 }
 
+class BufferLoadLdsGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen,
+                           untyped_imm_0:$size);
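+  // A buffer-to-LDS transfer reads the buffer and writes LDS without
+  // producing a result register, so it is modeled as both load and store.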
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
 def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
 def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
 def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
@@ -3018,6 +3029,7 @@
 def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
 def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
 def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_LDS : BufferLoadLdsGenericInstruction;
 
 class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
   let OutOperandList = (outs);
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 lds
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:4 glc lds
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:8 slc lds
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
+  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
+  %res = load float, float addrspace(3)* %ptr
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_imm_voffset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x800
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
+; GCN-LABEL: buffer_load_lds_dword_v_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_s_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword off, s[0:3], s5 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx2(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dwordx2:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dwordx2 off, s[0:3], 0 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 8, i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dwordx3:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dwordx3 off, s[0:3], 0 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 12, i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dwordx4:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dwordx4 off, s[0:3], 0 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 16, i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_ushort:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x800
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ushort v0, s[0:3], 0 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_ubyte:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ubyte off, s[0:3], 0 offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}
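
For illustration only (not part of the patch): a minimal IR usage sketch of the new intrinsic, assuming a hypothetical kernel and LDS buffer. It copies one dword from a buffer resource into LDS; the trailing zero arguments are soffset, imm offset, and aux.

@lds = internal addrspace(3) global [64 x i32] undef, align 4

define amdgpu_kernel void @copy_dword_to_lds(<4 x i32> %rsrc, i32 %voffset) {
  %ptr = bitcast [64 x i32] addrspace(3)* @lds to i8 addrspace(3)*
  ; i32 4 selects a 4-byte (dword) transfer per lane.
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %ptr, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
  ret void
}

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32>, i8 addrspace(3)* nocapture, i32 immarg, i32, i32, i32 immarg, i32 immarg)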