Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -435,11 +435,13 @@
   TBUFFER_STORE_FORMAT,
   TBUFFER_STORE_FORMAT_X3,
   TBUFFER_LOAD_FORMAT,
+  TBUFFER_LOAD_FORMAT_D16,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  BUFFER_LOAD_FORMAT_D16,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3804,11 +3804,13 @@
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -659,6 +659,21 @@
 defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
   "buffer_load_format_xyzw", VReg_128
 >;
+
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xy", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyz", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyzw", VReg_64
+>;
+
+
 defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
   "buffer_store_format_x", VGPR_32
 >;
@@ -855,11 +870,16 @@
 defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
 defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
 defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
 defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
 defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+
 let SubtargetPredicate = isCIVI in {
 
 //===----------------------------------------------------------------------===//
@@ -929,6 +949,12 @@
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -1354,6 +1380,10 @@
 defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
 defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
 defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
 
 multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
@@ -1404,6 +1434,7 @@
 defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
 defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
+
 //===----------------------------------------------------------------------===//
 // Target instructions, move to the appropriate target TD file
 //===----------------------------------------------------------------------===//
@@ -1617,6 +1648,10 @@
 defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
 defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
 defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>;
 defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
 defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
 defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
@@ -1710,3 +1745,7 @@
 defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <8>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <9>;
+//defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -201,6 +201,9 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+  if (!Subtarget->hasVOP3PInsts()) // Without VOP3P, v2f16 is not a legal type.
+    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -3223,6 +3226,68 @@
     }
     break;
   }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SDValue Op = SDValue(N, 0);
+    EVT ResultT = Op.getValueType();
+    // Should we assert here?
+    if (ResultT != MVT::v2f16 && ResultT != MVT::v4f16)
+      return;
+
+    // Get the equivalent legal type with i32-sized elements.
+    EVT EquivResultT = getEquivalentMemType(*DAG.getContext(), ResultT);
+    // Change from v4f16/v2f16 to EquivResultT.
+    SDVTList VTList = DAG.getVTList(EquivResultT, MVT::Other);
+
+    SDLoc DL(Op);
+    SDValue Res;
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (IID == Intrinsic::amdgcn_tbuffer_load) {
+      SDValue Ops[] = {
+        Op.getOperand(0),  // Chain
+        Op.getOperand(2),  // rsrc
+        Op.getOperand(3),  // vindex
+        Op.getOperand(4),  // voffset
+        Op.getOperand(5),  // soffset
+        Op.getOperand(6),  // offset
+        Op.getOperand(7),  // dfmt
+        Op.getOperand(8),  // nfmt
+        Op.getOperand(9),  // glc
+        Op.getOperand(10)  // slc
+      };
+      EVT VT = ResultT;
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(),
+        MachineMemOperand::MOLoad,
+        VT.getStoreSize(), VT.getStoreSize());
+
+      Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
+                                    VTList, Ops, VT, MMO);
+    } else if (IID == Intrinsic::amdgcn_buffer_load_format) {
+      SDValue Ops[] = {
+        Op.getOperand(0), // Chain
+        Op.getOperand(2), // rsrc
+        Op.getOperand(3), // vindex
+        Op.getOperand(4), // offset
+        Op.getOperand(5), // glc
+        Op.getOperand(6)  // slc
+      };
+      SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+      EVT VT = ResultT;
+      EVT IntVT = VT.changeTypeToInteger();
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(MFI->getBufferPSV()),
+        MachineMemOperand::MOLoad,
+        VT.getStoreSize(), VT.getStoreSize());
+
+      Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                    DL, VTList, Ops, IntVT, MMO);
+    } else {
+      return; // Should we assert here?
+    }
+
+    // Cast back to the original result type and put it in the "Results" list.
+    Results.push_back(DAG.getNode(ISD::BITCAST, DL, ResultT, Res));
+    Results.push_back(Res.getValue(1)); // Chain
+    break;
+  }
   case ISD::SELECT: {
     SDLoc SL(N);
     EVT VT = N->getValueType(0);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -45,8 +45,8 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
-  SDTypeProfile<1, 9,
+
+def SDTbuffer_load : SDTypeProfile<1, 9,
     [                     // vdata
      SDTCisVT<1, v4i32>,  // rsrc
      SDTCisVT<2, i32>,    // vindex(VGPR)
@@ -57,9 +57,14 @@
      SDTCisVT<7, i32>,    // nfmt(imm)
      SDTCisVT<8, i32>,    // glc(imm)
      SDTCisVT<9, i32>     // slc(imm)
-  ]>,
-  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
->;
+  ]>;
+
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+                            [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
+                            SDTbuffer_load,
+                            [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+
 def SDTtbuffer_store : SDTypeProfile<0, 10,
     [                     // vdata
@@ -92,6 +97,9 @@
                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
+                           SDTBufferLoad,
+                           [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 
 class SDSample<string opcode> : SDNode <opcode,
   SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
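Note on the type choices in the selection patterns above: the custom legalization
relies on getEquivalentMemType() to swap the illegal f16 vector types for
same-sized legal i32-based types. A minimal sketch of that mapping, assuming the
existing static helper in AMDGPUISelLowering.cpp keeps its current shape (the
patch presumably makes it visible to SIISelLowering.cpp; that hunk is not shown):

  // Maps a type to the same-sized legal type built from i32 elements:
  //   v2f16 (32 bits) -> i32, v4f16 (64 bits) -> v2i32.
  static EVT getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
    unsigned StoreSize = VT.getStoreSizeInBits();
    if (StoreSize <= 32)
      return EVT::getIntegerVT(Ctx, StoreSize);
    assert(StoreSize % 32 == 0 && "Store size not divisible by 32");
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
  }

This is also why the i32 and v2i32 load patterns are keyed to the new *_D16
nodes rather than to the existing ones: after legalization, an i32 result alone
can no longer distinguish a d16_xy load from a 32-bit x load.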
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.d16.ll
@@ -0,0 +1,33 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=D16 %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=D16 %s
+
+;D16-LABEL: {{^}}buffer_load_format_d16_x:
+;D16: buffer_load_format_d16_x
+;D16: s_waitcnt
+define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret half %data
+}
+
+;D16-LABEL: {{^}}buffer_load_format_d16_xy:
+;D16: buffer_load_format_d16_xy
+;D16: s_waitcnt
+define amdgpu_ps <2 x half> @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret <2 x half> %data
+}
+
+;D16-LABEL: {{^}}buffer_load_format_d16_xyzw:
+;D16: buffer_load_format_d16_xyzw
+;D16: s_waitcnt
+define amdgpu_ps <4 x half> @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret <4 x half> %data
+}
+
+declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1)
+declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1)
+declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
@@ -0,0 +1,35 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=D16 %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=D16 %s
+
+
+;D16-LABEL: {{^}}tbuffer_load_d16_x:
+;D16: tbuffer_load_format_d16_x
+;D16: s_waitcnt
+define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret half %data
+}
+
+;D16-LABEL: {{^}}tbuffer_load_d16_xy:
+;D16: tbuffer_load_format_d16_xy
+;D16: s_waitcnt
+define amdgpu_ps <2 x half> @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret <2 x half> %data
+}
+
+;D16-LABEL: {{^}}tbuffer_load_d16_xyzw:
+;D16: tbuffer_load_format_d16_xyzw
+;D16: s_waitcnt
+define amdgpu_ps <4 x half> @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret <4 x half> %data
+}
+
+declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
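A worked example of the legalization flow the patch implements (not part of the
diff; the fadd consumer is hypothetical, added only so the loaded value has a
user). On a target without packed-f16 VALU support such as tonga, v4f16 is not
a legal type, so ReplaceNodeResults rewrites the intrinsic node and bitcasts
back:

  ; INTRINSIC_W_CHAIN (v4f16)                  ; illegal result type
  ;   -> TBUFFER_LOAD_FORMAT_D16 (v2i32, ch)   ; legal i32-equivalent type
  ;   -> BITCAST v2i32 to v4f16                ; handed back to the users
  define amdgpu_ps <4 x half> @tbuffer_load_d16_then_use(<4 x i32> inreg %rsrc) {
  main_body:
    %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
    %use = fadd <4 x half> %data, %data
    ret <4 x half> %use
  }

  declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)

In both cases the selected machine instruction is tbuffer_load_format_d16_xyzw,
with the four halves packed into two VGPRs, matching the VReg_64 register class
chosen for the pseudo in BUFInstructions.td.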