Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -437,6 +437,39 @@
 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
 def int_amdgcn_buffer_store : AMDGPUBufferStore;

+def int_amdgcn_tbuffer_load : Intrinsic <
+  [llvm_anyint_ty],   // overloaded for types i32, v2i32, v4i32
+  [llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_i32_ty,       // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+   llvm_i32_ty,       // vaddr(VGPR)
+   llvm_i32_ty,       // soffset(SGPR)
+   llvm_i32_ty,       // inst_offset(imm)
+   llvm_i32_ty,       // dfmt(imm)
+   llvm_i32_ty,       // nfmt(imm)
+   llvm_i32_ty,       // offen(imm)
+   llvm_i32_ty,       // idxen(imm)
+   llvm_i32_ty,       // glc(imm)
+   llvm_i32_ty,       // slc(imm)
+   llvm_i32_ty],      // tfe(imm)
+  []>;
+
+def int_amdgcn_tbuffer_store : Intrinsic <
+  [],
+  [llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_anyint_ty,    // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+   llvm_i32_ty,       // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+   llvm_i32_ty,       // vaddr(VGPR)
+   llvm_i32_ty,       // soffset(SGPR)
+   llvm_i32_ty,       // inst_offset(imm)
+   llvm_i32_ty,       // dfmt(imm)
+   llvm_i32_ty,       // nfmt(imm)
+   llvm_i32_ty,       // offen(imm)
+   llvm_i32_ty,       // idxen(imm)
+   llvm_i32_ty,       // glc(imm)
+   llvm_i32_ty,       // slc(imm)
+   llvm_i32_ty],      // tfe(imm)
+  []>;
+
 class AMDGPUBufferAtomic : Intrinsic <
   [llvm_i32_ty],
   [llvm_i32_ty, // vdata(VGPR)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -349,6 +349,7 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  TBUFFER_LOAD_FORMAT,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3482,6 +3482,7 @@
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -674,9 +674,9 @@
 // MTBUF Instructions
 //===----------------------------------------------------------------------===//

-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
+def TBUFFER_LOAD_FORMAT_X : MTBUF_Load_Pseudo <"tbuffer_load_format_x", VGPR_32>;
+def TBUFFER_LOAD_FORMAT_XY : MTBUF_Load_Pseudo <"tbuffer_load_format_xy", VReg_64>;
+def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Load_Pseudo <"tbuffer_load_format_xyz", VReg_128>;
 def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>;
 def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
 def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
@@ -1078,6 +1078,23 @@
 // MTBUF Patterns
 //===----------------------------------------------------------------------===//

+// TBUFFER_LOAD_FORMAT_*, addr64=0
+class MTBUF_LoadResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
+  (vt (SItbuffer_load v4i32:$rsrc, num_channels, i32:$vaddr,
+                      i32:$soffset, imm:$inst_offset, imm:$dfmt,
+                      imm:$nfmt, imm:$offen, imm:$idxen,
+                      imm:$glc, imm:$slc, imm:$tfe)),
+  (opcode
+    (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_LoadResource <i32, 1, TBUFFER_LOAD_FORMAT_X>;
+def : MTBUF_LoadResource <v2i32, 2, TBUFFER_LOAD_FORMAT_XY>;
+def : MTBUF_LoadResource <v4i32, 3, TBUFFER_LOAD_FORMAT_XYZ>;
+def : MTBUF_LoadResource <v4i32, 4, TBUFFER_LOAD_FORMAT_XYZW>;
+
 // TBUFFER_STORE_FORMAT_*, addr64=0
 class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
   (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
@@ -1218,6 +1235,9 @@
   let Inst{18-16} = op;
 }

+def TBUFFER_LOAD_FORMAT_X_si : MTBUF_Real_si <0, TBUFFER_LOAD_FORMAT_X>;
+def TBUFFER_LOAD_FORMAT_XY_si : MTBUF_Real_si <1, TBUFFER_LOAD_FORMAT_XY>;
+def TBUFFER_LOAD_FORMAT_XYZ_si : MTBUF_Real_si <2, TBUFFER_LOAD_FORMAT_XYZ>;
 def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
 def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
 def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
@@ -1342,6 +1362,9 @@
   let Inst{18-15} = op;
 }

+def TBUFFER_LOAD_FORMAT_X_vi : MTBUF_Real_vi <0, TBUFFER_LOAD_FORMAT_X>;
+def TBUFFER_LOAD_FORMAT_XY_vi : MTBUF_Real_vi <1, TBUFFER_LOAD_FORMAT_XY>;
+def TBUFFER_LOAD_FORMAT_XYZ_vi : MTBUF_Real_vi <2, TBUFFER_LOAD_FORMAT_XYZ>;
 def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
 def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
 def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2938,6 +2938,8 @@
                                                  SelectionDAG &DAG) const {
   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   SDLoc DL(Op);
+  MachineFunction &MF = DAG.getMachineFunction();
+
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec: {
@@ -2963,7 +2965,6 @@
       Op.getOperand(5), // glc
       Op.getOperand(6)  // slc
     };
-    MachineFunction &MF = DAG.getMachineFunction();
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
@@ -2978,6 +2979,32 @@
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
   }
+  case Intrinsic::amdgcn_tbuffer_load: {
+    SDValue Ops[] = {
+      Op.getOperand(0),  // Chain
+      Op.getOperand(2),  // rsrc
+      Op.getOperand(3),  // num channels
+      Op.getOperand(4),  // vaddr
+      Op.getOperand(5),  // soffset
+      Op.getOperand(6),  // instr offset
+      Op.getOperand(7),  // dfmt
+      Op.getOperand(8),  // nfmt
+      Op.getOperand(9),  // offen
+      Op.getOperand(10), // idxen
+      Op.getOperand(11), // glc
+      Op.getOperand(12), // slc
+      Op.getOperand(13)  // tfe
+    };
+
+    EVT VT = Op.getValueType();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
   default:
     return SDValue();
   }
@@ -3047,7 +3074,8 @@
     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
                        Op.getOperand(2), Glue);
   }
-  case AMDGPUIntrinsic::SI_tbuffer_store: {
+  case AMDGPUIntrinsic::SI_tbuffer_store: // Retained for backward compatibility
+  case Intrinsic::amdgcn_tbuffer_store: {
     SDValue Ops[] = {
       Chain,
       Op.getOperand(2),
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -39,6 +39,25 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;

+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
+  SDTypeProfile<1, 12,
+    [SDTCisVT<0, iAny>,   // vdata(VGPR)
+     SDTCisVT<1, v4i32>,  // rsrc(SGPR)
+     SDTCisVT<2, i32>,    // num_channels(imm)
+     SDTCisVT<3, i32>,    // vaddr(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // offen(imm)
+     SDTCisVT<9, i32>,    // idxen(imm)
+     SDTCisVT<10, i32>,   // glc(imm)
+     SDTCisVT<11, i32>,   // slc(imm)
+     SDTCisVT<12, i32>    // tfe(imm)
+    ]>,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
   SDTypeProfile<0, 13,
     [SDTCisVT<0, v4i32>,  // rsrc(SGPR)
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -32,6 +32,23 @@
   def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;

+  // Fully-flexible TBUFFER_LOAD_FORMAT_* except for the ADDR64 bit, which is not exposed
+  def int_SI_tbuffer_load : Intrinsic <
+    [llvm_anyint_ty],   // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+    [llvm_anyint_ty,    // rsrc(SGPR)
+     llvm_i32_ty,       // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,       // vaddr(VGPR)
+     llvm_i32_ty,       // soffset(SGPR)
+     llvm_i32_ty,       // inst_offset(imm)
+     llvm_i32_ty,       // dfmt(imm)
+     llvm_i32_ty,       // nfmt(imm)
+     llvm_i32_ty,       // offen(imm)
+     llvm_i32_ty,       // idxen(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty],      // tfe(imm)
+    []>;
+
   // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
   def int_SI_tbuffer_store : Intrinsic <
     [],
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
@@ -0,0 +1,43 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs <4 x i32> @test1(i32 %vaddr) {
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> undef,
+      i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret <4 x i32> %vdata
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs <4 x i32> @test2(i32 %vaddr) {
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> undef,
+      i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret <4 x i32> %vdata
+}
+
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs <2 x i32> @test3(i32 %vaddr) {
+  %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> undef,
+      i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret <2 x i32> %vdata
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_load_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs float @test4(i32 %vaddr) {
+  %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> undef,
+      i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  %vdata.f = bitcast i32 %vdata to float
+  ret float %vdata.f
+}
+
+declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
@@ -0,0 +1,45 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
+      i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
+  call void @llvm.amdgcn.tbuffer.store.v2i32(<4 x i32> undef, <2 x i32> %vdata,
+      i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
+  call void @llvm.amdgcn.tbuffer.store.i32(<4 x i32> undef, i32 %vdata,
+      i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.tbuffer.store.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
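
Usage note (illustrative, not part of the patch): a minimal sketch of how a frontend might emit the new load intrinsic. The descriptor in %rsrc and the idxen addressing mode are assumptions for illustration; dfmt=14 and nfmt=4 are the SI encodings of BUF_DATA_FORMAT_32_32_32_32 and BUF_NUM_FORMAT_UINT, the same values the tests above use.

; Loads four dwords per vertex from the buffer described by %rsrc, at the
; descriptor's base plus a 16-byte constant offset, indexed by %vaddr
; (idxen=1, offen=0), with glc/slc/tfe all clear.
define amdgpu_vs <4 x i32> @example(<4 x i32> inreg %rsrc, i32 %vaddr) {
  %data = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %rsrc,
      i32 4,      ; num_channels: 4 selects TBUFFER_LOAD_FORMAT_XYZW
      i32 %vaddr, ; vaddr: per-vertex index
      i32 0,      ; soffset
      i32 16,     ; inst_offset
      i32 14,     ; dfmt: BUF_DATA_FORMAT_32_32_32_32
      i32 4,      ; nfmt: BUF_NUM_FORMAT_UINT
      i32 0,      ; offen
      i32 1,      ; idxen
      i32 0,      ; glc
      i32 0,      ; slc
      i32 0)      ; tfe
  ret <4 x i32> %data
}
declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)

The store intrinsic mirrors this signature with vdata inserted after rsrc, so the same dfmt/nfmt and addressing flags apply unchanged.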