Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -292,6 +292,12 @@ "Support clamp for integer destination" >; +def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", + "HasUnpackedD16VMem", + "true", + "Has unpacked d16 vmem instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -547,23 +553,27 @@ def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureSGPRInitBug]>; + FeatureSGPRInitBug, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureXNACK, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureSGPRInitBug]>; + FeatureSGPRInitBug, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, @@ -715,6 +725,11 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; +def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"FeatureUnpackedD16VMem">; +def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"!FeatureUnpackedD16VMem">; + def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; Index: lib/Target/AMDGPU/AMDGPUISelLowering.h 
=================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -451,14 +451,18 @@ LOAD_CONSTANT, TBUFFER_STORE_FORMAT, TBUFFER_STORE_FORMAT_X3, + TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, + TBUFFER_LOAD_FORMAT_D16, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, BUFFER_LOAD, BUFFER_LOAD_FORMAT, + BUFFER_LOAD_FORMAT_D16, BUFFER_STORE, BUFFER_STORE_FORMAT, + BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, BUFFER_ATOMIC_ADD, BUFFER_ATOMIC_SUB, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3976,14 +3976,18 @@ NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_FORMAT) + NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) NODE_NAME_CASE(BUFFER_ATOMIC_ADD) NODE_NAME_CASE(BUFFER_ATOMIC_SUB) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -61,7 +61,8 @@ VI = 1, SDWA = 2, SDWA9 = 3, - GFX9 = 4 + GFX80 = 4, + GFX9 = 5 }; static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -165,6 +165,7 @@ bool FlatGlobalInsts; bool 
FlatScratchInsts; bool AddNoCarryInsts; + bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -481,6 +482,10 @@ return AddNoCarryInsts; } + bool hasUnpackedD16VMem() const { + return HasUnpackedD16VMem; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv()); } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -162,6 +162,7 @@ FlatGlobalInsts(false), FlatScratchInsts(false), AddNoCarryInsts(false), + HasUnpackedD16VMem(false), R600ALUInst(false), CaymanISA(false), Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -671,6 +671,61 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < "buffer_store_format_xyzw", VReg_128 >; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_96 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_128 + >; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_96 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_128 + >; +} // End HasUnpackedD16VMem. 
+ +let SubtargetPredicate = HasPackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_64 + >; +} // End HasPackedD16VMem. + defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads < "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; @@ -860,6 +915,28 @@ defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; + defm 
TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; +} // End HasPackedD16VMem. + let SubtargetPredicate = isCIVI in { //===----------------------------------------------------------------------===// @@ -922,6 +999,20 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; +} // End HasPackedD16VMem. 
+ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -969,6 +1060,20 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; +} // End HasPackedD16VMem. + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1382,6 +1487,19 @@ defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; +} // End HasPackedD16VMem. + multiclass MTBUF_StoreIntrinsicPat { def : GCNPat< @@ -1431,6 +1549,19 @@ defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; +} // End HasPackedD16VMem. 
+ //===----------------------------------------------------------------------===// // Target instructions, move to the appropriate target TD file //===----------------------------------------------------------------------===// @@ -1628,6 +1759,35 @@ def _BOTHEN_vi : MUBUF_Real_vi (NAME#"_BOTHEN")>; } +class MUBUF_Real_gfx80 op, MUBUF_Pseudo ps> : + MUBUF_Real, + Enc64, + SIMCInstr { + let AssemblerPredicate=HasUnpackedD16VMem; + let DecoderNamespace="GFX80_UNPACKED"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{16} = lds; + let Inst{17} = !if(ps.has_slc, slc, ?); + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MUBUF_Real_AllAddr_gfx80 op> { + def _OFFSET_gfx80 : MUBUF_Real_gfx80 (NAME#"_OFFSET")>; + def _OFFEN_gfx80 : MUBUF_Real_gfx80 (NAME#"_OFFEN")>; + def _IDXEN_gfx80 : MUBUF_Real_gfx80 (NAME#"_IDXEN")>; + def _BOTHEN_gfx80 : MUBUF_Real_gfx80 (NAME#"_BOTHEN")>; +} + multiclass MUBUF_Real_Atomic_vi op> : MUBUF_Real_AllAddr_vi { def _OFFSET_RTN_vi : MUBUF_Real_vi (NAME#"_OFFSET_RTN")>; @@ -1644,6 +1804,26 @@ defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x08>; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x09>; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0a>; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0b>; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : 
MUBUF_Real_AllAddr_gfx80 <0x0c>; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0d>; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0e>; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0f>; +} // End HasUnpackedD16VMem. +let SubtargetPredicate = HasPackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>; +} // End HasPackedD16VMem. defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>; defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>; defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>; @@ -1729,11 +1909,61 @@ def _BOTHEN_vi : MTBUF_Real_vi (NAME#"_BOTHEN")>; } -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; -//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; +class MTBUF_Real_gfx80 op, MTBUF_Pseudo ps> : + MTBUF_Real, + Enc64, + SIMCInstr { + let AssemblerPredicate=HasUnpackedD16VMem; + let DecoderNamespace="GFX80_UNPACKED"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{18-15} = op; + let Inst{22-19} = 
!if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MTBUF_Real_AllAddr_gfx80 op> { + def _OFFSET_gfx80 : MTBUF_Real_gfx80 (NAME#"_OFFSET")>; + def _OFFEN_gfx80 : MTBUF_Real_gfx80 (NAME#"_OFFEN")>; + def _IDXEN_gfx80 : MTBUF_Real_gfx80 (NAME#"_IDXEN")>; + def _BOTHEN_gfx80 : MTBUF_Real_gfx80 (NAME#"_BOTHEN")>; +} + +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x08>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x09>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0a>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0b>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0c>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0d>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0e>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0f>; +} // End HasUnpackedD16VMem. 
+let SubtargetPredicate = HasPackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>; +} // End HasPackedD16VMem. Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -60,6 +60,10 @@ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerIntrinsicWChain_IllegalReturnType(SDValue Op, SDValue &Chain, + SelectionDAG &DAG) const; + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; + /// \brief Converts \p Op, which must be of floating point type, to the /// floating point type \p VT, by either extending or truncating it. 
SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -207,11 +207,14 @@ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -3501,6 +3504,78 @@ return SDValue(); } +static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL, + SelectionDAG &DAG, bool Unpacked) { + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. + // Truncate to v2i16/v4i16. + EVT IntLoadVT = LoadVT.changeTypeToInteger(); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result); + // Bitcast to original type (v2f16/v4f16). + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc); + } + // Cast back to the original packed type. + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); +} + +// This is to lower INTRINSIC_W_CHAIN with illegal result types. +SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op, + SDValue &Chain, SelectionDAG &DAG) const { + EVT LoadVT = Op.getValueType(); + // TODO: handle v3f16. + if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16) + return SDValue(); + + bool Unpacked = Subtarget->hasUnpackedD16VMem(); + EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + EVT EquivLoadVT = Unpacked ? 
UnpackedLoadVT : + getEquivalentMemType(*DAG.getContext(), LoadVT); + // Change from v4f16/v2f16 to EquivLoadVT. + SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); + + SDValue Res; + SDLoc DL(Op); + MemSDNode *M = cast(Op); + unsigned IID = cast(Op.getOperand(1))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + Chain = Res.getValue(1); + return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); + } + case Intrinsic::amdgcn_buffer_load_format: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // offset + Op.getOperand(5), // glc + Op.getOperand(6) // slc + }; + Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, + DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + Chain = Res.getValue(1); + return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); + } + default: + return SDValue(); + } +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { @@ -3528,6 +3603,16 @@ } break; } + case ISD::INTRINSIC_W_CHAIN: { + SDValue Chain; + if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0), + Chain, DAG)) { + Results.push_back(Res); + Results.push_back(Chain); + return; + } + break; + } case ISD::SELECT: { SDLoc SL(N); EVT VT = N->getValueType(0); @@ -4656,6 +4741,31 @@ } } +SDValue SITargetLowering::handleD16VData(SDValue VData, + SelectionDAG &DAG) const { + EVT StoreVT = VData.getValueType(); + SDLoc DL(VData); + + if 
(StoreVT.isVector()) { + assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); + if (!Subtarget->hasUnpackedD16VMem()) { + if (!isTypeLegal(StoreVT)) { + // If Target supports packed vmem, we just need to workaround + // the illegal type by casting to an equivalent one. + EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT); + return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + } + } else { // We need to unpack the packed data to store. + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + } + } + // No change for f16 and legal vector D16 types. + return VData; +} + SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -4798,9 +4908,13 @@ } case Intrinsic::amdgcn_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // voffset @@ -4811,37 +4925,34 @@ Op.getOperand(10), // glc Op.getOperand(11) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_buffer_store: case Intrinsic::amdgcn_buffer_store_format: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // offset Op.getOperand(6), // glc Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable, - VT.getStoreSize(), 4); - - unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ? - AMDGPUISD::BUFFER_STORE : - AMDGPUISD::BUFFER_STORE_FORMAT; - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } default: Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -25,7 +25,8 @@ int VI = 1; int SDWA = 2; int SDWA9 = 3; - int GFX9 = 4; + int GFX80 = 4; + int GFX9 = 5; } //===----------------------------------------------------------------------===// @@ -45,21 +46,24 @@ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", - SDTypeProfile<1, 9, - [ // vdata - SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex(VGPR) - SDTCisVT<3, i32>, // voffset(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) - ]>, - [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] ->; +def SDTbuffer_load : SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16", + SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; def SDTtbuffer_store : SDTypeProfile<0, 10, [ // vdata @@ -79,6 +83,9 @@ def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_d16 : 
SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata @@ -92,6 +99,9 @@ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", + SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SDTBufferStore : SDTypeProfile<0, 6, [ // vdata @@ -102,9 +112,13 @@ SDTCisVT<5, i1>]>; // slc def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; -def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; class SDBufferAtomic : SDNode (SIEncodingFamily.VI)], [!cast(SIEncodingFamily.SDWA)], [!cast(SIEncodingFamily.SDWA9)], + // GFX80 encoding is added to work around a multiple matching + // issue for buffer instructions with unpacked d16 data. This + // does not actually change the encoding, and thus may be + // removed later. 
+ [!cast(SIEncodingFamily.GFX80)], [!cast(SIEncodingFamily.GFX9)]]; } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}buffer_load_format_d16_x: +; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 +define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + ret half %data +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xy: +; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: +; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_lshrrev_b32_e32 
v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1) +declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1) +declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + +; GCN-LABEL: {{^}}buffer_store_format_d16_x: +; GCN: v_trunc_f16_e32 v[[LO:[0-9]+]], s{{[0-9]+}} +; GCN: buffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xy: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: buffer_store_format_d16_xy 
v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] +; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] + +; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1) +declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck 
-enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}tbuffer_load_d16_x: +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + ret half %data +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xy: +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <2 x half> 
@llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}tbuffer_store_d16_x: +; GCN: v_trunc_f16_e32 v[[LO:[0-9]+]], s{{[0-9]+}} +; GCN: tbuffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + + +; GCN-LABEL: {{^}}tbuffer_store_d16_xy: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret 
void +} + + +; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + +; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] +; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] + +; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) Index: test/MC/AMDGPU/buf-fmt-d16-packed.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/buf-fmt-d16-packed.s @@ -0,0 +1,74 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx810 -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=PACKED %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=PACKED %s + +// RUN: not llvm-mc -arch=amdgcn -mcpu=fiji -show-encoding 2>&1 %s | FileCheck -check-prefix=UNPACKED-ERR -check-prefix=GCN-ERR %s + + +//===----------------------------------------------------------------------===// +// Buffer 
Format Instructions. +//===----------------------------------------------------------------------===// + +buffer_load_format_d16_x v1, off, s[4:7], s1 +// PACKED: buffer_load_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_d16_xy v1, off, s[4:7], s1 +// PACKED: buffer_load_format_d16_xy v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1 +// PACKED: buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 +// PACKED: buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +buffer_store_format_d16_x v1, off, s[4:7], s1 +// PACKED: buffer_store_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_d16_xy v1, off, s[4:7], s1 +// PACKED: buffer_store_format_d16_xy v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1 +// PACKED: buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 +// PACKED: buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + + +//===----------------------------------------------------------------------===// +// TBuffer Format Instructions. 
+//===----------------------------------------------------------------------===// + +tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_load_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_load_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_load_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_load_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_store_format_d16_xy v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_store_format_d16_xyz v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU + +tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// PACKED: tbuffer_store_format_d16_xyzw v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: 
[0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] +// UNPACKED-ERR: error: instruction not supported on this GPU Index: test/MC/AMDGPU/buf-fmt-d16-unpacked.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/buf-fmt-d16-unpacked.s @@ -0,0 +1,73 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=fiji -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx810 -show-encoding 2>&1 %s | FileCheck -check-prefix=PACKED-ERR -check-prefix=GCN-ERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding 2>&1 %s | FileCheck -check-prefix=PACKED-ERR -check-prefix=GCN-ERR %s + + +//===----------------------------------------------------------------------===// +// Buffer Format Instructions. +//===----------------------------------------------------------------------===// + +buffer_load_format_d16_x v1, off, s[4:7], s1 +// UNPACKED: buffer_load_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_d16_xy v[1:2], off, s[4:7], s1 +// UNPACKED: buffer_load_format_d16_xy v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +buffer_load_format_d16_xyz v[1:3], off, s[4:7], s1 +// UNPACKED: buffer_load_format_d16_xyz v[1:3], off, s[4:7], s1 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +buffer_load_format_d16_xyzw v[1:4], off, s[4:7], s1 +// UNPACKED: buffer_load_format_d16_xyzw v[1:4], off, s[4:7], s1 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +buffer_store_format_d16_x v1, off, s[4:7], s1 +// UNPACKED: buffer_store_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_d16_xy v[1:2], off, s[4:7], s1 +// UNPACKED: buffer_store_format_d16_xy v[1:2], 
off, s[4:7], s1 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +buffer_store_format_d16_xyz v[1:3], off, s[4:7], s1 +// UNPACKED: buffer_store_format_d16_xyz v[1:3], off, s[4:7], s1 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +buffer_store_format_d16_xyzw v[1:4], off, s[4:7], s1 +// UNPACKED: buffer_store_format_d16_xyzw v[1:4], off, s[4:7], s1 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + + +//===----------------------------------------------------------------------===// +// TBuffer Format Instructions. +//===----------------------------------------------------------------------===// + +tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_load_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_load_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_load_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7c,0xe9,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_load_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_load_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7d,0xe9,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_store_format_d16_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: 
[0x00,0x00,0x7e,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_store_format_d16_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7e,0xe9,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_store_format_d16_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU + +tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 +// UNPACKED: tbuffer_store_format_d16_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7f,0xe9,0x00,0x01,0x01,0x01] +// PACKED-ERR: error: instruction not supported on this GPU