Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -286,6 +286,12 @@
   "Support clamp for integer destination"
 >;
 
+def FeaturePackedD16VMem : SubtargetFeature<"packed-d16-vmem",
+  "HasPackedD16VMem",
+  "true",
+  "Has packed d16 vmem instructions"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -485,7 +491,7 @@
   FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
   FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
   FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
-  FeatureAddNoCarryInsts
+  FeatureAddNoCarryInsts, FeaturePackedD16VMem
  ]
 >;
 
@@ -555,7 +561,8 @@
 def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
   [FeatureVolcanicIslands,
    FeatureLDSBankCount16,
-   FeatureXNACK]>;
+   FeatureXNACK,
+   FeaturePackedD16VMem]>;
 
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
@@ -714,6 +721,12 @@
 def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
   AssemblerPredicate<"FeatureGFX9Insts">;
 
+def HasPackedD16VMem : Predicate<"Subtarget->hasPackedD16VMem()">,
+  AssemblerPredicate<"FeaturePackedD16VMem">;
+
+def NotHasPackedD16VMem : Predicate<"!Subtarget->hasPackedD16VMem()">,
+  AssemblerPredicate<"!FeaturePackedD16VMem">;
+
 def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
   AssemblerPredicate<"FeatureGFX9Insts">;
 
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -436,12 +436,17 @@
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
   TBUFFER_STORE_FORMAT_X3,
+  TBUFFER_STORE_FORMAT_D16,
   TBUFFER_LOAD_FORMAT,
+  TBUFFER_LOAD_FORMAT_D16,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  BUFFER_LOAD_FORMAT_D16,
+  BUFFER_STORE_FORMAT,
+  BUFFER_STORE_FORMAT_D16,
   LAST_AMDGPU_ISD_NUMBER
 };
 
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3981,12 +3981,17 @@
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
+  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -163,6 +163,7 @@
   bool FlatAddressSpace;
   bool FlatInstOffsets;
   bool FlatGlobalInsts;
+  bool HasPackedD16VMem;
   bool FlatScratchInsts;
   bool AddNoCarryInsts;
   bool R600ALUInst;
@@ -445,6 +446,10 @@
     return getGeneration() >= GFX9;
   }
 
+  bool hasPackedD16VMem() const {
+    return HasPackedD16VMem;
+  }
+
   bool hasAddNoCarry() const {
     return AddNoCarryInsts;
   }
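[Reviewer note, not part of the patch] The register-class choices later in this patch follow from the two d16 data layouts: on subtargets with packed-d16-vmem (gfx810 and gfx9), two 16-bit components share one 32-bit VGPR, while the remaining VI parts return each component in the low half of its own VGPR. A minimal illustrative sketch of that relationship; the helper name is invented for exposition:

    // Number of 32-bit VGPRs moved by a *_d16 format operation with
    // NumComponents enabled components (x = 1 ... xyzw = 4).
    static unsigned numD16VGPRs(unsigned NumComponents, bool HasPackedD16VMem) {
      // Packed: two 16-bit components per VGPR; unpacked: one per VGPR.
      return HasPackedD16VMem ? (NumComponents + 1) / 2 : NumComponents;
    }

This is why, for example, BUFFER_LOAD_FORMAT_D16_XYZW below uses VReg_128 in its unpacked (_NP) definition but only VReg_64 in its packed one.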
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -153,6 +153,7 @@
     FlatAddressSpace(false),
     FlatInstOffsets(false),
     FlatGlobalInsts(false),
+    HasPackedD16VMem(false),
     FlatScratchInsts(false),
     AddNoCarryInsts(false),
 
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -671,6 +671,61 @@
 defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
   "buffer_store_format_xyzw", VReg_128
 >;
+
+let SubtargetPredicate = NotHasPackedD16VMem in { // NP means data not packed.
+defm BUFFER_LOAD_FORMAT_D16_X_NP : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XY_NP : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xy", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ_NP : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyz", VReg_96
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW_NP : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyzw", VReg_128
+>;
+defm BUFFER_STORE_FORMAT_D16_X_NP : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_D16_XY_NP : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_xy", VReg_64
+>;
+defm BUFFER_STORE_FORMAT_D16_XYZ_NP : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_xyz", VReg_96
+>;
+defm BUFFER_STORE_FORMAT_D16_XYZW_NP : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_xyzw", VReg_128
+>;
+} // End NotHasPackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xy", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyz", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_xyzw", VReg_64
+>;
+defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_xy", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_xyz", VReg_64
+>;
+defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
+  "buffer_store_format_d16_xyzw", VReg_64
+>;
+} // End HasPackedD16VMem.
+
 defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
   "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
 >;
@@ -860,6 +915,29 @@
 defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
 defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
 
+let SubtargetPredicate = NotHasPackedD16VMem in { // NP means data not packed.
+defm TBUFFER_LOAD_FORMAT_D16_X_NP : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XY_NP : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ_NP : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW_NP : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
+defm TBUFFER_STORE_FORMAT_D16_X_NP : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_D16_XY_NP : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
+defm TBUFFER_STORE_FORMAT_D16_XYZ_NP : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_D16_XYZW_NP : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+} // End NotHasPackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
+defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
+defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+} // End HasPackedD16VMem.
+
+
 let SubtargetPredicate = isCIVI in {
 
 //===----------------------------------------------------------------------===//
@@ -929,6 +1007,20 @@
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+
+let SubtargetPredicate = NotHasPackedD16VMem in { // NP means data not packed.
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_NP">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f32, "BUFFER_LOAD_FORMAT_D16_XY_NP">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f32, "BUFFER_LOAD_FORMAT_D16_XYZW_NP">;
+} // End NotHasPackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -973,9 +1065,23 @@
   >;
 }
 
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+
+let SubtargetPredicate = NotHasPackedD16VMem in { // NP means data not packed.
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_NP">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f32, "BUFFER_STORE_FORMAT_D16_XY_NP">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f32, "BUFFER_STORE_FORMAT_D16_XYZW_NP">;
+} // End NotHasPackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1355,6 +1461,19 @@
 defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
 defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
 
+let SubtargetPredicate = NotHasPackedD16VMem in { // NP means data not packed.
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_NP">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f32, "TBUFFER_LOAD_FORMAT_D16_XY_NP">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f32, "TBUFFER_LOAD_FORMAT_D16_XYZW_NP">;
+} // End NotHasPackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
 multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
   def : GCNPat<
@@ -1404,6 +1523,20 @@
 defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
 defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
 
+let SubtargetPredicate = NotHasPackedD16VMem in { // NP means data not packed.
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_NP">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f32, "TBUFFER_STORE_FORMAT_D16_XY_NP">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f32, "TBUFFER_STORE_FORMAT_D16_XYZW_NP">;
+} // End NotHasPackedD16VMem.
+
+let SubtargetPredicate = HasPackedD16VMem in {
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XYZW">;
+} // End HasPackedD16VMem.
+
+
 //===----------------------------------------------------------------------===//
 // Target instructions, move to the appropriate target TD file
 //===----------------------------------------------------------------------===//
@@ -1617,6 +1750,14 @@
 defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>;
 defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>;
 defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>;
+defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>;
+defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>;
+defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>;
+defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>;
 defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>;
 defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>;
 defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>;
@@ -1702,11 +1843,19 @@
   def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
 }
 
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
-//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>;
+//defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>;
+defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>;
+defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>;
+defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>;
+defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
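[Reviewer note, not part of the patch] The patterns above only make sense together with the type strategy implemented in SIISelLowering.cpp below: for v2f16/v4f16 data, packed targets keep the bits and merely sidestep illegal types by bitcasting to a same-sized integer type, while unpacked targets widen each component to f32 so it occupies a whole VGPR. A condensed sketch of that decision with an invented helper name; the real logic is split across ReplaceNodeResults and the intrinsic lowering:

    #include "llvm/CodeGen/ValueTypes.h"
    using namespace llvm;

    // Type a v2f16/v4f16 buffer value is carried as while it flows through
    // the D16 node. Packed: bit-preserving integer (recovered by BITCAST).
    // Unpacked: widened float (FP_ROUND after loads, FP_EXTEND before stores).
    static EVT getD16CarrierVT(EVT VT, bool HasPackedD16VMem) {
      if (HasPackedD16VMem)
        return VT == MVT::v2f16 ? EVT(MVT::i32) : EVT(MVT::v2i32);
      return VT == MVT::v2f16 ? EVT(MVT::v2f32) : EVT(MVT::v4f32);
    }

Scalar f16 needs neither fix-up, which is why the f16 patterns appear in both predicate blocks with the same type.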
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -201,11 +201,14 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
 
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -536,6 +539,24 @@
 // TargetLowering queries
 //===----------------------------------------------------------------------===//
 
+static bool isHalfVT(EVT VT) {
+  return (VT == MVT::f16 ||
+          VT == MVT::v2f16 ||
+          //VT == MVT::v3f16 ||
+          VT == MVT::v4f16);
+}
+
+static EVT getExtVT(EVT VT) {
+  if (VT == MVT::v2f16)
+    return MVT::v2f32;
+  // else if (VT == MVT::v3f16)
+  //   return MVT::v3f32;
+  else if (VT == MVT::v4f16)
+    return MVT::v4f32;
+  else
+    llvm_unreachable("Handle new half type");
+}
+
 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
   // SI has some legal vector types, but no legal vector operations. Say no
   // shuffles are legal in order to prefer scalarizing some vector operations.
@@ -559,6 +580,31 @@
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::amdgcn_buffer_load:
+  case Intrinsic::amdgcn_buffer_load_format:
+  case Intrinsic::amdgcn_tbuffer_load: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = nullptr;
+    Info.align = 0;
+
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::amdgcn_buffer_store_format:
+  case Intrinsic::amdgcn_tbuffer_store: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+    Info.ptrVal = nullptr;
+    Info.align = 0;
+
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
   default:
     return false;
   }
@@ -3219,6 +3265,44 @@
   return SDValue();
 }
 
+static SDValue ChangeResultType(SDValue Op, EVT EquivResultT,
+                                SelectionDAG &DAG) {
+  // Change from v4f16/v2f16 to EquivResultT.
+  SDVTList VTList = DAG.getVTList(EquivResultT, MVT::Other);
+  SDLoc DL(Op);
+  MemSDNode *M = cast<MemSDNode>(Op);
+  unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  if (IID == Intrinsic::amdgcn_tbuffer_load) {
+    SDValue Ops[] = {
+      Op.getOperand(0),  // Chain
+      Op.getOperand(2),  // rsrc
+      Op.getOperand(3),  // vindex
+      Op.getOperand(4),  // voffset
+      Op.getOperand(5),  // soffset
+      Op.getOperand(6),  // offset
+      Op.getOperand(7),  // dfmt
+      Op.getOperand(8),  // nfmt
+      Op.getOperand(9),  // glc
+      Op.getOperand(10)  // slc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
+                                   VTList, Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
+  } else if (IID == Intrinsic::amdgcn_buffer_load_format) {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Op.getOperand(4), // offset
+      Op.getOperand(5), // glc
+      Op.getOperand(6)  // slc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                   DL, VTList, Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
+  }
+  return SDValue();
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
@@ -3246,6 +3330,27 @@
     }
     break;
   }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SDLoc SL(N);
+    SDValue Op = SDValue(N, 0);
+    EVT ResultT = Op.getValueType();
+    // TODO: handle v3f16.
+    if (ResultT != MVT::v2f16 && ResultT != MVT::v4f16)
+      return;
+
+    bool HasPacked = Subtarget->hasPackedD16VMem();
+    EVT ToVT = HasPacked ? getEquivalentMemType(*DAG.getContext(), ResultT)
+                         : getExtVT(ResultT); // v2f16/v4f16 to v2f32/v4f32.
+    if (SDValue Res = ChangeResultType(Op, ToVT, DAG)) {
+      if (!HasPacked) { // From v2f32/v4f32 back to v2f16/v4f16.
+        SDValue RFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+        Results.push_back(DAG.getNode(ISD::FP_ROUND, SL, ResultT, Res, RFlag));
+      } else // Cast back to the original packed type.
+        Results.push_back(DAG.getNode(ISD::BITCAST, SL, ResultT, Res));
+      Results.push_back(Res.getValue(1)); // Chain
+    }
+    break;
+  }
   case ISD::SELECT: {
     SDLoc SL(N);
     EVT VT = N->getValueType(0);
@@ -4167,10 +4272,8 @@
 
 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
-  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   SDLoc DL(Op);
-  MachineFunction &MF = DAG.getMachineFunction();
-
+  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec: {
@@ -4188,6 +4291,7 @@
   }
   case Intrinsic::amdgcn_buffer_load:
   case Intrinsic::amdgcn_buffer_load_format: {
+    MemSDNode *M = cast<MemSDNode>(Op);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // rsrc
@@ -4196,21 +4300,14 @@
       Op.getOperand(5), // glc
       Op.getOperand(6)  // slc
     };
-    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-    EVT VT = Op.getValueType();
-    EVT IntVT = VT.changeTypeToInteger();
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(MFI->getBufferPSV()),
-      MachineMemOperand::MOLoad,
-      VT.getStoreSize(), VT.getStoreSize());
-
-    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
   }
   case Intrinsic::amdgcn_tbuffer_load: {
+    MemSDNode *M = cast<MemSDNode>(Op);
     SDValue Ops[] = {
       Op.getOperand(0),  // Chain
       Op.getOperand(2),  // rsrc
@@ -4221,17 +4318,12 @@
       Op.getOperand(7),  // dfmt
       Op.getOperand(8),  // nfmt
       Op.getOperand(9),  // glc
-      Op.getOperand(10)   // slc
+      Op.getOperand(10)  // slc
     };
-    EVT VT = Op.getOperand(2).getValueType();
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOLoad,
-      VT.getStoreSize(), VT.getStoreSize());
 
     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+                                   Op->getVTList(), Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
   }
 
   // Basic sample.
   case Intrinsic::amdgcn_image_sample:
@@ -4437,10 +4529,64 @@
                                    Op->getVTList(), Ops, VT, MMO);
   }
 
+  case Intrinsic::amdgcn_buffer_store_format: {
+    SDValue VData = Op.getOperand(2);
+    EVT MemVT = VData.getValueType();
+    unsigned Opc = AMDGPUISD::BUFFER_STORE_FORMAT;
+    if (isHalfVT(MemVT)) {
+      Opc = AMDGPUISD::BUFFER_STORE_FORMAT_D16;
+      // TODO: Handle v3f16.
+      if (MemVT == MVT::v2f16 || MemVT == MVT::v4f16) {
+        if (Subtarget->hasPackedD16VMem()) {
+          if (!isTypeLegal(MemVT)) {
+            // If the target supports packed vmem, we just need to work around
+            // the illegal type by casting to an equivalent one.
+            EVT EquivMemVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+            VData = DAG.getNode(ISD::BITCAST, DL, EquivMemVT, VData);
+          }
+        } else { // We need to unpack the packed data to store.
+          EVT ExtendedT = getExtVT(MemVT);
+          VData = DAG.getNode(ISD::FP_EXTEND, DL, ExtendedT, VData);
+        }
+      }
+    }
+    SDValue Ops[] = {
+      Chain,            // Chain
+      VData,            // vdata
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Op.getOperand(5), // offset
+      Op.getOperand(6), // glc
+      Op.getOperand(7)  // slc
+    };
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
   case Intrinsic::amdgcn_tbuffer_store: {
+    SDValue VData = Op.getOperand(2);
+    EVT MemVT = VData.getValueType();
+    unsigned Opc = AMDGPUISD::TBUFFER_STORE_FORMAT;
+    if (isHalfVT(MemVT)) {
+      Opc = AMDGPUISD::TBUFFER_STORE_FORMAT_D16;
+      // TODO: Handle v3f16.
+      if (MemVT == MVT::v2f16 || MemVT == MVT::v4f16) {
+        if (Subtarget->hasPackedD16VMem()) {
+          if (!isTypeLegal(MemVT)) {
+            // If the target supports packed vmem, we just need to work around
+            // the illegal type by casting to an equivalent one.
+            EVT EquivMemVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+            VData = DAG.getNode(ISD::BITCAST, DL, EquivMemVT, VData);
+          }
+        } else { // We need to unpack the packed data to store.
+          EVT ExtendedT = getExtVT(MemVT);
+          VData = DAG.getNode(ISD::FP_EXTEND, DL, ExtendedT, VData);
+        }
+      }
+    }
     SDValue Ops[] = {
       Chain,
-      Op.getOperand(2), // vdata
+      VData,            // vdata
       Op.getOperand(3), // rsrc
       Op.getOperand(4), // vindex
       Op.getOperand(5), // voffset
@@ -4451,15 +4597,10 @@
       Op.getOperand(10), // glc
       Op.getOperand(11)  // slc
     };
-    EVT VT = Op.getOperand(3).getValueType();
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOStore,
-      VT.getStoreSize(), 4);
-
-    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
   }
-
   default:
     return Op;
   }
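[Reviewer note, not part of the patch] Since getTgtMemIntrinsic now derives memVT from the IR-level call, the d16 forms are reached simply by issuing the existing buffer intrinsics with half-typed values; no new intrinsics are introduced. A sketch of emitting such a call through IRBuilder, assuming a module and insertion point are already set up (era-appropriate API, illustrative only):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Emits: %d = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(
    //                 <4 x i32> %rsrc, i32 0, i32 0, i1 false, i1 false)
    static Value *emitD16BufferLoad(IRBuilder<> &B, Module *M, Value *Rsrc) {
      Type *V4HalfTy = VectorType::get(B.getHalfTy(), 4);
      Function *Decl = Intrinsic::getDeclaration(
          M, Intrinsic::amdgcn_buffer_load_format, {V4HalfTy});
      Value *Zero = B.getInt32(0);
      Value *False = B.getFalse();
      return B.CreateCall(Decl, {Rsrc, /*vindex=*/Zero, /*voffset=*/Zero,
                                 /*glc=*/False, /*slc=*/False});
    }

The tests added below exercise exactly these overloads (f16, v2f16, v4f16) on one unpacked and two packed subtargets.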
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -45,8 +45,8 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
-  SDTypeProfile<1, 9,
+
+def SDTbuffer_load : SDTypeProfile<1, 9,
     [                   // vdata
      SDTCisVT<1, v4i32>, // rsrc
      SDTCisVT<2, i32>,   // vindex(VGPR)
@@ -57,9 +57,14 @@
      SDTCisVT<7, i32>,   // nfmt(imm)
      SDTCisVT<8, i32>,   // glc(imm)
      SDTCisVT<9, i32>    // slc(imm)
-    ]>,
-  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
->;
+    ]>;
+
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+                            [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
+                                SDTbuffer_load,
+                                [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
+
 
 def SDTtbuffer_store : SDTypeProfile<0, 10,
     [                   // vdata
@@ -79,6 +84,9 @@
 def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
                                 SDTtbuffer_store,
                                 [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
+                                 SDTtbuffer_store,
+                                 [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 
 def SDTBufferLoad : SDTypeProfile<1, 5,
     [                   // vdata
@@ -92,6 +100,23 @@
                       [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                       [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
+                      SDTBufferLoad,
+                      [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+
+def SDTBufferStore : SDTypeProfile<0, 6,
+    [                    // vdata
+     SDTCisVT<1, v4i32>, // rsrc
+     SDTCisVT<2, i32>,   // vindex
+     SDTCisVT<3, i32>,   // offset
+     SDTCisVT<4, i1>,    // glc
+     SDTCisVT<5, i1>]>;  // slc
+def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
+                      SDTBufferStore,
+                      [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
+                      SDTBufferStore,
+                      [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 
 class SDSample<string opcode> : SDNode <opcode,
   SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
@@ -0,0 +1,46 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=UNPACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+
+;UNPACKED-LABEL: {{^}}buffer_load_format_d16_x:
+;UNPACKED: buffer_load_format_d16_x v0, off, s[0:3], 0
+;UNPACKED: s_waitcnt
+
+;PACKED-LABEL: {{^}}buffer_load_format_d16_x:
+;PACKED: buffer_load_format_d16_x v0, off, s[0:3], 0
+;PACKED: s_waitcnt
+define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret half %data
+}
+
+;UNPACKED-LABEL: {{^}}buffer_load_format_d16_xy:
+;UNPACKED: buffer_load_format_d16_xy v[0:1], off, s[0:3], 0
+;UNPACKED: s_waitcnt
+
+;PACKED-LABEL: {{^}}buffer_load_format_d16_xy:
+;PACKED: buffer_load_format_d16_xy v0, off, s[0:3], 0
+;PACKED: s_waitcnt
+define amdgpu_ps <2 x half> @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret <2 x half> %data
+}
+
+;UNPACKED-LABEL: {{^}}buffer_load_format_d16_xyzw:
+;UNPACKED: buffer_load_format_d16_xyzw v[0:3], off, s[0:3], 0
+;UNPACKED: s_waitcnt
+
+;PACKED-LABEL: {{^}}buffer_load_format_d16_xyzw:
+;PACKED: buffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0
+;PACKED: s_waitcnt
+define amdgpu_ps <4 x half> @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  ret <4 x half> %data
+}
+
+declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1)
+declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1)
+declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
@@ -0,0 +1,46 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=UNPACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+
+;UNPACKED-LABEL: {{^}}buffer_store_format_d16_x:
+;UNPACKED: buffer_store_format_d16_x v0, v1, s[4:7], 0 idxen
+;UNPACKED: s_endpgm
+
+;PACKED-LABEL: {{^}}buffer_store_format_d16_x:
+;PACKED: buffer_store_format_d16_x v0, v1, s[4:7], 0 idxen
+;PACKED: s_endpgm
+define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> inreg %rsrc, half %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;UNPACKED-LABEL: {{^}}buffer_store_format_d16_xy:
+;UNPACKED: buffer_store_format_d16_xy v[0:1], v2, s[4:7], 0 idxen
+;UNPACKED: s_endpgm
+
+;PACKED-LABEL: {{^}}buffer_store_format_d16_xy:
+;PACKED: buffer_store_format_d16_xy v0, v1, s[4:7], 0 idxen
+;PACKED: s_endpgm
+define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;UNPACKED-LABEL: {{^}}buffer_store_format_d16_xyzw:
+;UNPACKED: buffer_store_format_d16_xyzw v[0:3], v4, s[4:7], 0 idxen
+;UNPACKED: s_endpgm
+
+;PACKED-LABEL: {{^}}buffer_store_format_d16_xyzw:
+;PACKED: buffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 idxen
+;PACKED: s_endpgm
+define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1)
+declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1)
+declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
@@ -0,0 +1,47 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=UNPACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+
+
+;UNPACKED-LABEL: {{^}}tbuffer_load_d16_x:
+;UNPACKED: tbuffer_load_format_d16_x v0, off, s[0:3], dfmt:6, nfmt:1, 0
+;UNPACKED: s_waitcnt
+
+;PACKED-LABEL: {{^}}tbuffer_load_d16_x:
+;PACKED: tbuffer_load_format_d16_x v0, off, s[0:3], dfmt:6, nfmt:1, 0
+;PACKED: s_waitcnt
+define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret half %data
+}
+
+;UNPACKED-LABEL: {{^}}tbuffer_load_d16_xy:
+;UNPACKED: tbuffer_load_format_d16_xy v[0:1], off, s[0:3], dfmt:6, nfmt:1, 0
+;UNPACKED: s_waitcnt
+
+;PACKED-LABEL: {{^}}tbuffer_load_d16_xy:
+;PACKED: tbuffer_load_format_d16_xy v0, off, s[0:3], dfmt:6, nfmt:1, 0
+;PACKED: s_waitcnt
+define amdgpu_ps <2 x half> @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret <2 x half> %data
+}
+
+;UNPACKED-LABEL: {{^}}tbuffer_load_d16_xyzw:
+;UNPACKED: tbuffer_load_format_d16_xyzw v[0:3], off, s[0:3], dfmt:6, nfmt:1, 0
+;UNPACKED: s_waitcnt
+
+;PACKED-LABEL: {{^}}tbuffer_load_d16_xyzw:
+;PACKED: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], dfmt:6, nfmt:1, 0
+;PACKED: s_waitcnt
+define amdgpu_ps <4 x half> @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  ret <4 x half> %data
+}
+
+declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
@@ -0,0 +1,49 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=UNPACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+;RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=PACKED %s
+
+
+;UNPACKED-LABEL: {{^}}tbuffer_store_d16_x:
+;UNPACKED: tbuffer_store_format_d16_x v0, v1, s[4:7], dfmt:1, nfmt:2, 0 idxen
+;UNPACKED: s_endpgm
+
+;PACKED-LABEL: {{^}}tbuffer_store_d16_x:
+;PACKED: tbuffer_store_format_d16_x v0, v1, s[4:7], dfmt:1, nfmt:2, 0 idxen
+;PACKED: s_endpgm
+define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> inreg %rsrc, half %data, i32 %vindex) {
+main_body:
+  call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+  ret void
+}
+
+
+;UNPACKED-LABEL: {{^}}tbuffer_store_d16_xy:
+;UNPACKED: tbuffer_store_format_d16_xy v[0:1], v2, s[4:7], dfmt:1, nfmt:2, 0 idxen
+;UNPACKED: s_endpgm
+
+;PACKED-LABEL: {{^}}tbuffer_store_d16_xy:
+;PACKED: tbuffer_store_format_d16_xy v0, v1, s[4:7], dfmt:1, nfmt:2, 0 idxen
+;PACKED: s_endpgm
+define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
+main_body:
+  call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+  ret void
+}
+
+
+;UNPACKED-LABEL: {{^}}tbuffer_store_d16_xyzw:
+;UNPACKED: tbuffer_store_format_d16_xyzw v[0:3], v4, s[4:7], dfmt:1, nfmt:2, 0 idxen
+;UNPACKED: s_endpgm
+
+;PACKED-LABEL: {{^}}tbuffer_store_d16_xyzw:
+;PACKED: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], dfmt:1, nfmt:2, 0 idxen
+;PACKED: s_endpgm
+define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %vindex) {
+main_body:
+  call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)