Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -292,6 +292,12 @@ "Support clamp for integer destination" >; +def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", + "HasUnpackedD16VMem", + "true", + "Has unpacked d16 vmem instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -547,23 +553,27 @@ def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureSGPRInitBug]>; + FeatureSGPRInitBug, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureXNACK, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureSGPRInitBug]>; + FeatureSGPRInitBug, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, @@ -715,6 +725,11 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; +def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"FeatureUnpackedD16VMem">; +def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"!FeatureUnpackedD16VMem">; + def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -441,14 +441,18 @@ LOAD_CONSTANT, TBUFFER_STORE_FORMAT, TBUFFER_STORE_FORMAT_X3, + TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, + TBUFFER_LOAD_FORMAT_D16, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, BUFFER_LOAD, BUFFER_LOAD_FORMAT, + BUFFER_LOAD_FORMAT_D16, BUFFER_STORE, BUFFER_STORE_FORMAT, + BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, BUFFER_ATOMIC_ADD, BUFFER_ATOMIC_SUB, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3976,14 +3976,18 @@ NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_FORMAT) + NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) NODE_NAME_CASE(BUFFER_ATOMIC_ADD) NODE_NAME_CASE(BUFFER_ATOMIC_SUB) Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -165,6 +165,7 @@ bool FlatGlobalInsts; bool FlatScratchInsts; bool AddNoCarryInsts; + bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -481,6 +482,10 @@ return AddNoCarryInsts; } + bool hasUnpackedD16VMem() const { + return HasUnpackedD16VMem; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -162,6 +162,7 @@ FlatGlobalInsts(false), FlatScratchInsts(false), AddNoCarryInsts(false), + HasUnpackedD16VMem(false), R600ALUInst(false), CaymanISA(false), Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -671,6 +671,61 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < "buffer_store_format_xyzw", VReg_128 >; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_96 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_128 + >; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_96 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_128 + >; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_64 + >; +} // End HasPackedD16VMem. + defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads < "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; @@ -860,6 +915,28 @@ defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_128>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_128>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; +} // End HasPackedD16VMem. + let SubtargetPredicate = isCIVI in { //===----------------------------------------------------------------------===// @@ -922,6 +999,20 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; +} // End HasPackedD16VMem. + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -969,6 +1060,20 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; +} // End HasPackedD16VMem. + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1382,6 +1487,19 @@ defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; +} // End HasPackedD16VMem. + multiclass MTBUF_StoreIntrinsicPat { def : GCNPat< @@ -1431,6 +1549,19 @@ defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; +} // End HasPackedD16VMem. + //===----------------------------------------------------------------------===// // Target instructions, move to the appropriate target TD file //===----------------------------------------------------------------------===// @@ -1644,6 +1775,14 @@ defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>; +defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>; +defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>; +defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>; +defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>; +defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>; +defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>; +defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>; +defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>; defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>; defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>; defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>; @@ -1729,11 +1868,19 @@ def _BOTHEN_vi : MTBUF_Real_vi (NAME#"_BOTHEN")>; } -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; -//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>; +defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>; +defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>; +//defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>; +defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>; +defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>; +defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>; +defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>; +defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -207,11 +207,14 @@ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -550,6 +553,13 @@ // TargetLowering queries //===----------------------------------------------------------------------===// +static bool isHalfVT (EVT VT) { + return (VT == MVT::f16 || + VT == MVT::v2f16 || + //VT == MVT::v3f16 || + VT == MVT::v4f16); +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -573,6 +583,32 @@ Info.writeMem = true; return true; } + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_tbuffer_load: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = nullptr; + Info.align = 0; + + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::amdgcn_buffer_store: + case Intrinsic::amdgcn_buffer_store_format: + case Intrinsic::amdgcn_tbuffer_store: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); + Info.ptrVal = nullptr; + Info.align = 0; + + Info.vol = false; + Info.readMem = false; + Info.writeMem = true; + return true; + } default: return false; } @@ -3296,6 +3332,50 @@ return SDValue(); } +static SDValue lowerIntrinsicWChain(SDValue Op, EVT EquivResultT, + SelectionDAG &DAG) { + // Change from v4f16/v2f16 to EquivResultT. + SDVTList VTList = DAG.getVTList(EquivResultT, MVT::Other); + SDLoc DL(Op); + MemSDNode *M = cast(Op); + unsigned IID = cast(Op.getOperand(1))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + } + case Intrinsic::amdgcn_buffer_load_format: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // offset + Op.getOperand(5), // glc + Op.getOperand(6) // slc + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, + DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + } + default: + return SDValue(); + } // End switch. + return SDValue(); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { @@ -3323,6 +3403,32 @@ } break; } + case ISD::INTRINSIC_W_CHAIN: { + SDLoc SL(N); + SDValue Op = SDValue(N, 0); + EVT LoadVT = Op.getValueType(); + // TODO: handle v3f16. + if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16) + return; + + bool HasPacked = !Subtarget->hasUnpackedD16VMem(); + EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + EVT EquivLoadVT = HasPacked ? + getEquivalentMemType(*DAG.getContext(), LoadVT) + : UnpackedLoadVT; + if (SDValue Res = lowerIntrinsicWChain(Op, EquivLoadVT, DAG)) { + if (!HasPacked) { // From v2i32/v4i32 back to v2f16/v4f16. + // Truncate to v2i16/v4i16. + EVT IntLoadVT = LoadVT.changeTypeToInteger(); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, IntLoadVT, Res); + // Bitcast to original type (v2f16/v4f16). + Results.push_back(DAG.getNode(ISD::BITCAST, SL, LoadVT, Trunc)); + } else // Cast back to the original packed type. + Results.push_back(DAG.getNode(ISD::BITCAST, SL, LoadVT, Res)); + Results.push_back(Res.getOperand(0)); // Chain + } + break; + } case ISD::SELECT: { SDLoc SL(N); EVT VT = N->getValueType(0); @@ -4274,6 +4380,7 @@ } case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: { + MemSDNode *M = cast(Op); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4282,21 +4389,13 @@ Op.getOperand(5), // glc Op.getOperand(6) // slc }; - SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(MFI->getBufferPSV()), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); + AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { + MemSDNode *M = cast(Op); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4307,17 +4406,11 @@ Op.getOperand(7), // dfmt Op.getOperand(8), // nfmt Op.getOperand(9), // glc - Op.getOperand(10) // slc + Op.getOperand(10) // slc }; - - EVT VT = Op.getOperand(2).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -4611,9 +4704,32 @@ } case Intrinsic::amdgcn_tbuffer_store: { + SDValue VData = Op.getOperand(2); + EVT StoreVT = VData.getValueType(); + unsigned Opc = AMDGPUISD::TBUFFER_STORE_FORMAT; + if (isHalfVT(StoreVT)) { + Opc = AMDGPUISD::TBUFFER_STORE_FORMAT_D16; + // TODO: Handle v3f16. + if (StoreVT == MVT::v2f16 || StoreVT== MVT::v4f16) { + if (!Subtarget->hasUnpackedD16VMem()) { + if (!isTypeLegal(StoreVT)) { + // If Target supports packed vmem, we just need to workaround + // the illegal type by casting to an equivalent one. + EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), + StoreVT); + VData = DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + } + } else {// We need to unpack the packed data to store. + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + VData = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + } + } + } SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // voffset @@ -4624,37 +4740,49 @@ Op.getOperand(10), // glc Op.getOperand(11) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_buffer_store: case Intrinsic::amdgcn_buffer_store_format: { + SDValue VData = Op.getOperand(2); + EVT StoreVT = VData.getValueType(); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + if (isHalfVT(StoreVT)) { + Opc = AMDGPUISD::BUFFER_STORE_FORMAT_D16; + // TODO: Handle v3f16. + if (StoreVT == MVT::v2f16 || StoreVT== MVT::v4f16) { + if (!Subtarget->hasUnpackedD16VMem()) { + if (!isTypeLegal(StoreVT)) { + // If Target supports packed vmem, we just need to workaround + // the illegal type by casting to an equivalent one. + EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), + StoreVT); + VData = DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + } + } else {// We need to unpack the packed data to store. + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + VData = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + } + } + } SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // offset Op.getOperand(6), // glc Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable, - VT.getStoreSize(), 4); - - unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ? - AMDGPUISD::BUFFER_STORE : - AMDGPUISD::BUFFER_STORE_FORMAT; - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } default: Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -45,21 +45,24 @@ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", - SDTypeProfile<1, 9, - [ // vdata - SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex(VGPR) - SDTCisVT<3, i32>, // voffset(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) - ]>, - [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] ->; +def SDTbuffer_load : SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16", + SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; def SDTtbuffer_store : SDTypeProfile<0, 10, [ // vdata @@ -79,6 +82,9 @@ def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata @@ -92,6 +98,9 @@ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", + SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SDTBufferStore : SDTypeProfile<0, 6, [ // vdata @@ -102,9 +111,13 @@ SDTCisVT<5, i1>]>; // slc def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; -def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; class SDBufferAtomic : SDNode inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + ret half %data +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xy: +; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: +; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1) +declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1) +declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + +; GCN-LABEL: {{^}}buffer_store_format_d16_x: +; GCN: v_trunc_f16_e32 v[[LO:[0-9]+]], s{{[0-9]+}} +; GCN: buffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xy: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] +; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] + +; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}tbuffer_load_d16_x: +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + ret half %data +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xy: +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}tbuffer_store_d16_x: +; GCN: v_trunc_f16_e32 v[[LO:[0-9]+]], s{{[0-9]+}} +; GCN: tbuffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + + +; GCN-LABEL: {{^}}tbuffer_store_d16_xy: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + + +; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + +; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] +; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] + +; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)