Add D16 variants of the AMDGPU buffer/tbuffer format loads.

What this patch touches:
  * AMDGPUISelLowering.{h,cpp}: new AMDGPUISD opcodes TBUFFER_LOAD_FORMAT_D16
    and BUFFER_LOAD_FORMAT_D16, with matching NODE_NAME_CASE entries.
  * BUFInstructions.td: pseudo and VI "real" definitions for
    (T)BUFFER_LOAD_FORMAT_D16_{X,XY,XYZ,XYZW}, plus load intrinsic patterns.
  * SIInstrInfo.td: SDNodes SItbuffer_load_d16 and SIbuffer_load_format_d16;
    the tbuffer load type profile is factored out as SDTbuffer_load.
  * SIISelLowering.cpp: buffer/tbuffer load intrinsics are described in the
    intrinsic-info switch, so the lowering reuses the node's own memory VT and
    memory operand; v2f16/v4f16 INTRINSIC_W_CHAIN results are rewritten in
    ReplaceNodeResults to a D16 node of the equivalent memory type and bitcast
    back to the original result type.
  * Two new llc tests covering f16, v2f16 and v4f16 results.

Review fix: ReplaceNodeResults now pushes Res.getValue(1) -- the chain result
of the newly created load node -- as the replacement chain. The earlier
Res.getOperand(0) is that node's chain input, so users of the original chain
would not have been ordered after the load.

NOTE(review): several spots read as if `<...>` arguments were dropped when this
diff was extracted (e.g. `cast(Op)`, `defm : MUBUF_LoadIntrinsicPat;`,
`SmallVectorImpl &Results`). They are left exactly as found; restore them from
the original review before applying. Indentation is a best-effort
reconstruction and whitespace-only context lines are not reproduced, so hunk
headers are carried over unchanged and may not match line-for-line.

NOTE(review): the VI real definition of TBUFFER_LOAD_FORMAT_D16_XYZ is still
commented out although its pseudo is defined -- confirm this is intentional.

Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -437,11 +437,13 @@
   TBUFFER_STORE_FORMAT,
   TBUFFER_STORE_FORMAT_X3,
   TBUFFER_LOAD_FORMAT,
+  TBUFFER_LOAD_FORMAT_D16,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  BUFFER_LOAD_FORMAT_D16,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3982,11 +3982,13 @@
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -659,6 +659,21 @@
 defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
   "buffer_load_format_xyzw", VReg_128
 >;
+
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xy", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyz", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyzw", VReg_64
+>;
+
+
 defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
   "buffer_store_format_x", VGPR_32
 >;
@@ -855,11 +870,16 @@
 defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
 defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
 defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
 defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
 defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+
 let SubtargetPredicate = isCIVI in {
 //===----------------------------------------------------------------------===//
@@ -929,6 +949,12 @@
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
+
+defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
+
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
@@ -1354,6 +1380,10 @@
 defm : MTBUF_LoadIntrinsicPat;
 defm : MTBUF_LoadIntrinsicPat;
 defm : MTBUF_LoadIntrinsicPat;
+defm : MTBUF_LoadIntrinsicPat;
+defm : MTBUF_LoadIntrinsicPat;
+defm : MTBUF_LoadIntrinsicPat;
+defm : MTBUF_LoadIntrinsicPat;
 multiclass MTBUF_StoreIntrinsicPat {
@@ -1404,6 +1434,7 @@
 defm : MTBUF_StoreIntrinsicPat;
 defm : MTBUF_StoreIntrinsicPat;
+
 //===----------------------------------------------------------------------===//
 // Target instructions, move to the appropriate target TD file
 //===----------------------------------------------------------------------===//
@@ -1617,6 +1648,10 @@
 defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
 defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
 defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>;
 defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
 defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
 defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
@@ -1710,3 +1745,7 @@
 defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <8>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <9>;
+//defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -201,6 +201,8 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -559,6 +561,19 @@
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::amdgcn_buffer_load:
+  case Intrinsic::amdgcn_buffer_load_format:
+  case Intrinsic::amdgcn_tbuffer_load: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = nullptr;
+    Info.align = 0;
+
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
   default:
     return false;
   }
@@ -3219,6 +3234,44 @@
   return SDValue();
 }
+static SDValue ChangeResultType(SDValue Op, EVT EquivResultT, SelectionDAG &DAG) {
+  // Change from v4f16/v2f16 to EquivResultT.
+  SDVTList VTList = DAG.getVTList(EquivResultT, MVT::Other);
+  SDLoc DL(Op);
+  MemSDNode *M = cast(Op);
+  unsigned IID = cast(Op.getOperand(1))->getZExtValue();
+  if (IID == Intrinsic::amdgcn_tbuffer_load) {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Op.getOperand(4), // voffset
+      Op.getOperand(5), // soffset
+      Op.getOperand(6), // offset
+      Op.getOperand(7), // dfmt
+      Op.getOperand(8), // nfmt
+      Op.getOperand(9), // glc
+      Op.getOperand(10) // slc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
+                                   VTList, Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
+  } else if (IID == Intrinsic::amdgcn_buffer_load_format) {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Op.getOperand(4), // offset
+      Op.getOperand(5), // glc
+      Op.getOperand(6) // slc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                   DL, VTList, Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
+  }
+  return SDValue();
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl &Results,
                                           SelectionDAG &DAG) const {
@@ -3246,6 +3299,21 @@
     }
     break;
   }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SDValue Op = SDValue(N, 0);
+    EVT ResultT = Op.getValueType();
+    // TODO: handle v3f16.
+    if (ResultT != MVT::v2f16 && ResultT != MVT::v4f16)
+      return;
+
+    EVT EquivResultT = getEquivalentMemType(*DAG.getContext(), ResultT);
+    if (SDValue Res = ChangeResultType(Op, EquivResultT, DAG)) {
+      // Cast back to the original result type, and put in the "Results" list.
+      Results.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), ResultT, Res));
+      Results.push_back(Res.getValue(1)); // Chain result of the new load node
+    }
+    break;
+  }
   case ISD::SELECT: {
     SDLoc SL(N);
     EVT VT = N->getValueType(0);
@@ -4167,10 +4235,8 @@
 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
-  unsigned IntrID = cast(Op.getOperand(1))->getZExtValue();
   SDLoc DL(Op);
-  MachineFunction &MF = DAG.getMachineFunction();
-
+  unsigned IntrID = cast(Op.getOperand(1))->getZExtValue();
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec: {
@@ -4188,6 +4254,7 @@
   }
   case Intrinsic::amdgcn_buffer_load:
   case Intrinsic::amdgcn_buffer_load_format: {
+    MemSDNode *M = cast(Op);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // rsrc
@@ -4196,21 +4263,14 @@
       Op.getOperand(5), // glc
       Op.getOperand(6) // slc
     };
-    SIMachineFunctionInfo *MFI = MF.getInfo();
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-    EVT VT = Op.getValueType();
-    EVT IntVT = VT.changeTypeToInteger();
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(MFI->getBufferPSV()),
-      MachineMemOperand::MOLoad,
-      VT.getStoreSize(), VT.getStoreSize());
-
-    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
   }
   case Intrinsic::amdgcn_tbuffer_load: {
+    MemSDNode *M = cast(Op);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // rsrc
@@ -4221,17 +4281,12 @@
       Op.getOperand(7), // dfmt
       Op.getOperand(8), // nfmt
       Op.getOperand(9), // glc
-      Op.getOperand(10) // slc
+      Op.getOperand(10) // slc
     };
-    EVT VT = Op.getOperand(2).getValueType();
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOLoad,
-      VT.getStoreSize(), VT.getStoreSize());
     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+                                   Op->getVTList(), Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
   }
   // Basic sample.
   case Intrinsic::amdgcn_image_sample:
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -45,8 +45,8 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
-  SDTypeProfile<1, 9,
+
+def SDTbuffer_load : SDTypeProfile<1, 9,
     [ // vdata
      SDTCisVT<1, v4i32>, // rsrc
      SDTCisVT<2, i32>, // vindex(VGPR)
@@ -57,9 +57,14 @@
      SDTCisVT<7, i32>, // nfmt(imm)
      SDTCisVT<8, i32>, // glc(imm)
      SDTCisVT<9, i32> // slc(imm)
-    ]>,
-  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
->;
+    ]>;
+
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+                            [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
+                                SDTbuffer_load,
+                                [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+
 def SDTtbuffer_store : SDTypeProfile<0, 10,
     [ // vdata
@@ -92,6 +97,9 @@
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
+                                SDTBufferLoad,
+                                [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 class SDSample : SDNode , SDTCisVT<2, v8i32>,
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
@@ -0,0 +1,33 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+;GCN-LABEL: {{^}}buffer_load_format_d16_x:
+;GCN: buffer_load_format_d16_x v0, off, s[0:3], 0
+;GCN: s_waitcnt
+define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret half %data
+}
+
+;GCN-LABEL: {{^}}buffer_load_format_d16_xy:
+;GCN: buffer_load_format_d16_xy v0, off, s[0:3], 0
+;GCN: s_waitcnt
+define amdgpu_ps <2 x half> @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret < 2 x half> %data
+}
+
+;GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
+;GCN: buffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0
+;GCN: s_waitcnt
+define amdgpu_ps <4 x half> @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret <4 x half> %data
+}
+
+declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1)
+declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1)
+declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
@@ -0,0 +1,33 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+;GCN-LABEL: {{^}}tbuffer_load_d16_x:
+;GCN: tbuffer_load_format_d16_x v0, off, s[0:3], dfmt:6, nfmt:1, 0
+;GCN: s_waitcnt
+define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret half %data
+}
+
+;GCN-LABEL: {{^}}tbuffer_load_d16_xy:
+;GCN: tbuffer_load_format_d16_xy v0, off, s[0:3], dfmt:6, nfmt:1, 0
+;GCN: s_waitcnt
+define amdgpu_ps <2 x half> @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret < 2 x half> %data
+}
+
+;GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
+;GCN: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], dfmt:6, nfmt:1, 0
+;GCN: s_waitcnt
+define amdgpu_ps <4 x half> @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret <4 x half> %data
+}
+
+declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)