[AMDGPU] Add llvm.amdgcn.struct.buffer.load.lds intrinsic

Adds the struct (vindex-addressed) counterpart of
llvm.amdgcn.raw.buffer.load.lds:
  * IntrinsicsAMDGPU.td: intrinsic definition; the data byte size, imm offset
    and auxiliary data arguments are immediates.
  * AMDGPULegalizerInfo.cpp: legalized through legalizeBufferLoadLds, like the
    raw variant.
  * AMDGPURegisterBankInfo.cpp: rsrc (1), LDS base (2) and soffset (6) are
    SGPR operands; vindex (4) and voffset (5) are VGPR operands.
  * SIISelLowering.cpp: memVT is derived from the byte-size argument; the
    lowering sets idxen and copies the LDS base to M0.
  * New codegen test covering both SelectionDAG and GlobalISel.

Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1278,6 +1278,23 @@
    ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
 
+class AMDGPUStructBufferLoadLDS : Intrinsic <
+  [],
+  [llvm_v4i32_ty,                      // rsrc(SGPR)
+   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
+   llvm_i32_ty,                        // Data byte size: 1/2/4
+   llvm_i32_ty,                        // vindex(VGPR)
+   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
+   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
+   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
+                                       //                                       bit 1 = slc,
+                                       //                                       bit 2 = dlc on gfx10+))
+                                       //                      swizzled buffer (bit 3 = swz))
+  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
+   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
+
 } // defset AMDGPUBufferIntrinsics
 
 // Uses that do not set the done bit should set IntrWriteMem on the
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5370,6 +5370,7 @@
   case Intrinsic::amdgcn_struct_buffer_load:
     return legalizeBufferLoad(MI, MRI, B, false, false);
   case Intrinsic::amdgcn_raw_buffer_load_lds:
+  case Intrinsic::amdgcn_struct_buffer_load_lds:
     return legalizeBufferLoadLds(MI, MRI, B);
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_struct_buffer_load_format:
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4483,6 +4483,14 @@
       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
       break;
     }
+    case Intrinsic::amdgcn_struct_buffer_load_lds: {
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
+      break;
+    }
     case Intrinsic::amdgcn_struct_buffer_store:
     case Intrinsic::amdgcn_struct_tbuffer_store: {
       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1195,7 +1195,8 @@
     switch (IntrID) {
     default:
       break;
-    case Intrinsic::amdgcn_raw_buffer_load_lds: {
+    case Intrinsic::amdgcn_raw_buffer_load_lds:
+    case Intrinsic::amdgcn_struct_buffer_load_lds: {
       unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
       Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
       return true;
@@ -8260,6 +8261,28 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL,
                                    M->getVTList(), Ops, M->getMemoryVT(), MMO);
   }
+  case Intrinsic::amdgcn_struct_buffer_load_lds: {
+    auto *M = cast<MemSDNode>(Op);
+    MachineMemOperand *MMO = M->getMemOperand();
+
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(5), // vindex
+      Op.getOperand(6), // voffset
+      Op.getOperand(7), // soffset
+      Op.getOperand(8), // imm_offset
+      Op.getOperand(9), // cachepolicy, swizzled buffer
+      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+      Op.getOperand(4), // data byte size
+      copyToM0(DAG, Chain, DL, Op.getOperand(3)).getValue(1) // Glue
+    };
+
+    updateBufferMMO(MMO, Ops[3], Ops[4], Ops[5], Ops[2]);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL,
+                                   M->getVTList(), Ops, M->getMemoryVT(), MMO);
+  }
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL
+
+declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds) {
+; SDAG-LABEL: buffer_load_lds_dword:
+; SDAG:       ; %bb.0: ; %main_body
+; SDAG-NEXT:    v_mov_b32_e32 v0, 8
+; SDAG-NEXT:    s_mov_b32 m0, s4
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen lds
+; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; SDAG-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    ds_read_b32 v0, v0
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: buffer_load_lds_dword:
+; GISEL:       ; %bb.0: ; %main_body
+; GISEL-NEXT:    s_mov_b32 m0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v0, 8
+; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen lds
+; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds
+; GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    ds_read_b32 v0, v0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
+  %ptr = bitcast i8 addrspace(3)* %lds to float addrspace(3)*
+  %res = load float, float addrspace(3)* %ptr
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_dword_imm_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], 0 idxen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_v_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset) {
+; GCN-LABEL: buffer_load_lds_dword_v_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], 0 idxen offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_s_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_s_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v0, s[0:3], s5 idxen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], s5 idxen offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_vs_imm_offset(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GCN-LABEL: buffer_load_lds_dword_vs_imm_offset:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v[0:1], s[0:3], s5 idxen offen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 4, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ushort(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_ushort:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x800
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ushort v[0:1], s[0:3], 0 idxen offen lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 2, i32 %vindex, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_ubyte(<4 x i32> inreg %rsrc, i8 addrspace(3)* inreg %lds, i32 %vindex) {
+; GCN-LABEL: buffer_load_lds_ubyte:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_ubyte v0, s[0:3], 0 idxen offset:2048 lds
+; GCN-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, i8 addrspace(3)* %lds, i32 1, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}