Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1261,6 +1261,40 @@
 // Legacy form of the intrinsic. raw and struct forms should be preferred.
 def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
+
+class AMDGPURawBufferLoadLDS : Intrinsic <
+  [],
+  [llvm_v4i32_ty,                      // rsrc(SGPR)
+   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
+   llvm_i32_ty,                        // Data byte size: 1/2/4
+   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
+   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
+   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                                       //                                   bit 1 = slc,
+                                       //                                   bit 2 = dlc on gfx10+))
+                                       //                 swizzled buffer (bit 3 = swz))
+  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
+   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
+
+class AMDGPUStructBufferLoadLDS : Intrinsic <
+  [],
+  [llvm_v4i32_ty,                      // rsrc(SGPR)
+   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
+   llvm_i32_ty,                        // Data byte size: 1/2/4
+   llvm_i32_ty,                        // vindex(VGPR)
+   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
+   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
+   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                                       //                                   bit 1 = slc,
+                                       //                                   bit 2 = dlc on gfx10+))
+                                       //                 swizzled buffer (bit 3 = swz))
+  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
+   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
+
 } // defset AMDGPUBufferIntrinsics

 // Uses that do not set the done bit should set IntrWriteMem on the
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -143,6 +143,7 @@
   bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
   bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
                               MachineOperand &DataOp) const;
+  bool selectBufferLoadLds(MachineInstr &MI) const;
   bool selectBVHIntrinsic(MachineInstr &I) const;
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectWaveAddress(MachineInstr &I) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1780,6 +1780,9 @@
     return selectSBarrier(I);
   case Intrinsic::amdgcn_global_atomic_fadd:
     return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
+  case Intrinsic::amdgcn_raw_buffer_load_lds:
+  case Intrinsic::amdgcn_struct_buffer_load_lds:
+    return selectBufferLoadLds(I);
   default: {
     return selectImpl(I, *CoverageInfo);
   }
@@ -3054,6 +3057,98 @@
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }

+bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
+  unsigned Opc;
+  unsigned Size = MI.getOperand(3).getImm();
+
+  // The struct intrinsic variants add one additional operand over raw.
+  const bool HasVIndex = MI.getNumOperands() == 9;
+  Register VIndex;
+  int OpOffset = 0;
+  if (HasVIndex) {
+    VIndex = MI.getOperand(4).getReg();
+    OpOffset = 1;
+  }
+
+  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+  Optional<ValueAndVReg> MaybeVOffset =
+      getIConstantVRegValWithLookThrough(VOffset, *MRI);
+  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
+
+  switch (Size) {
+  default:
+    llvm_unreachable("Unhandled buffer load LDS size");
+  case 1:
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+    break;
+  case 2:
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+    break;
+  case 4:
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+    break;
+  }
+
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+    .add(MI.getOperand(2));
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
+
+  if (HasVIndex && HasVOffset) {
+    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
+    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
+      .addReg(VIndex)
+      .addImm(AMDGPU::sub0)
+      .addReg(VOffset)
+      .addImm(AMDGPU::sub1);
+
+    MIB.addReg(IdxReg);
+  } else if (HasVIndex) {
+    MIB.addReg(VIndex);
+  } else if (HasVOffset) {
+    MIB.addReg(VOffset);
+  }
+
+  MIB.add(MI.getOperand(1));            // rsrc
+  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
+  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
+  MIB.addImm(Aux & AMDGPU::CPol::ALL);  // cpol
+  MIB.addImm((Aux >> 3) & 1);           // swz
+
+  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+  MachinePointerInfo StorePtrI = LoadPtrI;
+  StorePtrI.V = nullptr;
+  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+  auto F = LoadMMO->getFlags() &
+           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+                                     Size, LoadMMO->getBaseAlign());
+
+  MachineMemOperand *StoreMMO =
+      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+                               sizeof(int32_t), LoadMMO->getBaseAlign());
+
+  MIB.setMemRefs({LoadMMO, StoreMMO});
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
   MI.removeOperand(1);
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3012,6 +3012,20 @@
     constrainOpWithReadfirstlane(MI, MRI, 2);
     return;
   }
+  case Intrinsic::amdgcn_raw_buffer_load_lds: {
+    applyDefaultMapping(OpdMapper);
+    constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
+    constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+    constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
+    return;
+  }
+  case Intrinsic::amdgcn_struct_buffer_load_lds: {
+    applyDefaultMapping(OpdMapper);
+    constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
+    constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+    constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
+    return;
+  }
   default: {
     if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
             AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4436,6 +4450,13 @@
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
+    case Intrinsic::amdgcn_raw_buffer_load_lds: {
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+      break;
+    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -4454,6 +4475,14 @@
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
+    case Intrinsic::amdgcn_struct_buffer_load_lds: {
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
+      break;
+    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1191,6 +1191,17 @@
       // XXX - Should this be volatile without known ordering?
       Info.flags |= MachineMemOperand::MOVolatile;
+
+      switch (IntrID) {
+      default:
+        break;
+      case Intrinsic::amdgcn_raw_buffer_load_lds:
+      case Intrinsic::amdgcn_struct_buffer_load_lds: {
+        unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+        Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+        return true;
+      }
+      }
     }
     return true;
   }
@@ -8228,6 +8239,85 @@
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
+  case Intrinsic::amdgcn_raw_buffer_load_lds:
+  case Intrinsic::amdgcn_struct_buffer_load_lds: {
+    unsigned Opc;
+    bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
+    unsigned OpOffset = HasVIndex ? 1 : 0;
+    SDValue VOffset = Op.getOperand(5 + OpOffset);
+    auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
+    bool HasVOffset = !CVOffset || !CVOffset->isZero();
+    unsigned Size = Op->getConstantOperandVal(4);
+
+    switch (Size) {
+    default:
+      llvm_unreachable("Unhandled buffer load LDS size");
+    case 1:
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+      break;
+    case 2:
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+      break;
+    case 4:
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+      break;
+    }
+
+    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+    SmallVector<SDValue, 8> Ops;
+
+    if (HasVIndex && HasVOffset)
+      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
+                                       { Op.getOperand(5), // VIndex
+                                         VOffset }));
+    else if (HasVIndex)
+      Ops.push_back(Op.getOperand(5));
+    else if (HasVOffset)
+      Ops.push_back(VOffset);
+
+    Ops.push_back(Op.getOperand(2));            // rsrc
+    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
+    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
+    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
+    Ops.push_back(
+      DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
+    Ops.push_back(
+      DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8));          // swz
+    Ops.push_back(M0Val.getValue(0)); // Chain
+    Ops.push_back(M0Val.getValue(1)); // Glue
+
+    auto *M = cast<MemSDNode>(Op);
+    MachineMemOperand *LoadMMO = M->getMemOperand();
+    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+    LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
+    MachinePointerInfo StorePtrI = LoadPtrI;
+    StorePtrI.V = nullptr;
+    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+    auto F = LoadMMO->getFlags() &
+             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+    LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+                                      Size, LoadMMO->getBaseAlign());
+
+    MachineMemOperand *StoreMMO =
+        MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+                                sizeof(int32_t), LoadMMO->getBaseAlign());
+
+    auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
+    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+    return SDValue(Load, 0);
+  }
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
Index: llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1099,6 +1099,9 @@
         unsigned AS = Memop->getAddrSpace();
         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
           continue;
+        // No need to wait before load from VMEM to LDS.
+        if (mayWriteLDSThroughDMA(MI))
+          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        // VM_CNT is only relevant to vgpr or LDS.
        ScoreBrackets.determineWait(
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -385,6 +385,8 @@
     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
     if (DataOpIdx == -1)
       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+    if (DataOpIdx == -1) // LDS DMA
+      return false;
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.lds.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 lds
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:4 glc lds
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:8 slc lds
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
+  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
+  %res = load float, float addrspace(3)* %ptr
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_imm_voffset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_imm_voffset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x800
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset) {
+; GCN-LABEL: buffer_load_lds_dword_v_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_s_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword off, s[0:3], s5 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 offen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_ushort:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x800
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ushort v0, s[0:3], 0 offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; GCN-LABEL: buffer_load_lds_ubyte:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ubyte off, s[0:3], 0 offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
+
+declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; SDAG-LABEL: buffer_load_lds_dword:
+; SDAG:       ; %bb.0: ; %main_body
+; SDAG-NEXT:    v_mov_b32_e32 v0, 8
+; SDAG-NEXT:    s_mov_b32 m0, s4
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen lds
+; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    ds_read_b32 v0, v0
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: buffer_load_lds_dword:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_mov_b32 m0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v0, 8
+; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen lds
+; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    ds_read_b32 v0, v0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
+  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
+  %res = load float, float addrspace(3)* %ptr
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_dword_imm_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
+; GCN-LABEL: buffer_load_lds_dword_v_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_s_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 idxen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_ushort:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x800
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_ubyte:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}