diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -369,6 +369,12 @@ "Support gfx10-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands" >; +def FeatureG16 : SubtargetFeature<"g16", + "HasG16", + "true", + "Support G16 for 16-bit gradient image operands" +>; + def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "HasNSAEncoding", "true", @@ -689,7 +695,7 @@ FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, - FeatureGFX10A16 + FeatureGFX10A16, FeatureG16 ] >; @@ -1104,6 +1110,9 @@ def HasGFX10A16 : Predicate<"Subtarget->hasGFX10A16()">, AssemblerPredicate<(all_of FeatureGFX10A16)>; +def HasG16 : Predicate<"Subtarget->hasG16()">, + AssemblerPredicate<(all_of FeatureG16)>; + def HasDPP16 : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -351,6 +351,7 @@ bool HasDPP8; bool HasR128A16; bool HasGFX10A16; + bool HasG16; bool HasNSAEncoding; bool HasDLInsts; bool HasDot1Insts; @@ -1005,6 +1006,10 @@ return HasGFX10A16; } + bool hasG16() const { + return HasG16; + } + bool hasOffset3fBug() const { return HasOffset3fBug; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -244,6 +244,7 @@ HasDPP8(false), HasR128A16(false), HasGFX10A16(false), + HasG16(false), HasNSAEncoding(false), HasDLInsts(false), HasDot1Insts(false), diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -35,6 +35,7 @@ bit Gather4 = 0; bits<8> NumExtraArgs = 0; bit Gradients = 0; + bit G16 = 0; bit Coordinates = 1; bit LodOrClampOrMip = 0; bit HasD16 = 0; @@ -47,9 +48,9 @@ def MIMGBaseOpcodesTable : GenericTable { let FilterClass = "MIMGBaseOpcode"; let CppTypeName = "MIMGBaseOpcodeInfo"; - let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", - "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", - "HasD16"]; + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", + "LodOrClampOrMip", "HasD16"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; let PrimaryKey = ["BaseOpcode"]; @@ -117,6 +118,22 @@ let PrimaryKeyName = "getMIMGMIPMappingInfo"; } +class MIMGG16Mapping { + MIMGBaseOpcode G = g; + MIMGBaseOpcode G16 = g16; +} + +def MIMGG16MappingTable : GenericTable { + let FilterClass = "MIMGG16Mapping"; + let CppTypeName = "MIMGG16MappingInfo"; + let Fields = ["G", "G16"]; + GenericEnum TypeOf_G = MIMGBaseOpcode; + GenericEnum TypeOf_G16 = MIMGBaseOpcode; + + let PrimaryKey = ["G"]; + let PrimaryKeyName = "getMIMGG16MappingInfo"; +} + class MIMG_Base : InstSI { @@ -646,10 +663,11 @@ } multiclass MIMG_Sampler op, AMDGPUSampleVariant sample, bit wqm = 0, - bit isGetLod = 0, - string asm = "image_sample"#sample.LowerCaseMod> { + bit isG16 = 0, bit isGetLod = 0, + string asm = 
"image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> { def "" : MIMG_Sampler_BaseOpcode { let HasD16 = !if(isGetLod, 0, 1); + let G16 = isG16; } let BaseOpcode = !cast(NAME), WQM = wqm, @@ -726,73 +744,89 @@ //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI //} // End let FPAtomic = 1 -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, 
AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; - -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>; +defm 
IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; + +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; +defm 
IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; @@ -835,3 +869,21 @@ // MIP to NONMIP Optimization Mapping def : MIMGMIPMapping; def : MIMGMIPMapping; + +// G to G16 Optimization Mapping +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5310,6 +5310,7 @@ SmallVector ResultTypes(Op->value_begin(), Op->value_end()); SmallVector OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; + bool IsG16 = false; bool IsA16 = false; SDValue VData; int NumVDataDwords; @@ -5416,45 +5417,100 @@ } } - // Check for 16 bit addresses and pack if true. + // Push back extra arguments. + for (unsigned i = 0; i < BaseOpcode->NumExtraArgs; i++) + VAddrs.push_back(Op.getOperand(AddrIdx + i)); + + // Check for 16 bit derivatives and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); - const MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + if (BaseOpcode->Gradients && (VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16)) { + IsG16 = true; + + // 1D: undef,dx/dh; undef,dx/dv + // 2D: dy/dh,dx/dh; dy/dv,dx/dv + // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv + for (unsigned i = 0; i < 2; i++) { + unsigned Idx = DimIdx + i * (NumGradients / 2); + SDValue Pack; + if ((NumGradients / 2) >= 2) { + Pack = DAG.getBuildVector(PackVectorVT, DL, + {Op.getOperand(Idx), Op.getOperand(Idx + 1)}); + Pack = DAG.getBitcast(MVT::f32, Pack); + VAddrs.push_back(Pack); + } + if ((NumGradients / 2) % 2 != 0) { + // Odd, either 1D or 3D + Pack = Op.getOperand(Idx + NumGradients / 2 - 1); + if (Pack.getValueType() != MVT::i16) + Pack = DAG.getBitcast(MVT::i16, Pack); + Pack = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Pack); + Pack = DAG.getBitcast(MVT::f32, Pack); + VAddrs.push_back(Pack); + } + } + } else { + for (unsigned i = 0; i < NumGradients; i++) + VAddrs.push_back(Op.getOperand(DimIdx + i)); + } + + // Check for 16 bit addresses and pack if true. 
+ unsigned CoordIdx = DimIdx + NumGradients; + unsigned CoordsEnd = AddrIdx + NumMIVAddrs; + VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType(); + VAddrScalarVT = VAddrVT.getScalarType(); + PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16))) { // Illegal to use a16 images - if (!ST->hasFeature(AMDGPU::FeatureR128A16) && !ST->hasFeature(AMDGPU::FeatureGFX10A16)) + if (!ST->hasFeature(AMDGPU::FeatureR128A16) && !ST->hasFeature(AMDGPU::FeatureGFX10A16)) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit addresses\n"); return Op; + } + + // Illegal to use 16 bit addresses with 32 bit derivatives + if (BaseOpcode->Gradients && !IsG16) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "need 16 bit derivatives but got 32 bit derivatives\n"); + return Op; + } + IsG16 = false; IsA16 = true; - const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; - for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) { - SDValue AddrLo; - // Push back extra arguments. - if (i < DimIdx) { - AddrLo = Op.getOperand(i); + for (unsigned i = CoordIdx; i < CoordsEnd; i++) { + SDValue Pack; + if ((i + 1) < CoordsEnd) { + Pack = DAG.getBuildVector(PackVectorVT, DL, + {Op.getOperand(i), + Op.getOperand(i + 1)}); + i++; } else { - // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, - // in 1D, derivatives dx/dh and dx/dv are packed with undef. - if (((i + 1) >= (AddrIdx + NumMIVAddrs)) || - ((NumGradients / 2) % 2 == 1 && - (i == DimIdx + (NumGradients / 2) - 1 || - i == DimIdx + NumGradients - 1))) { - AddrLo = Op.getOperand(i); - if (AddrLo.getValueType() != MVT::i16) - AddrLo = DAG.getBitcast(MVT::i16, Op.getOperand(i)); - AddrLo = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, AddrLo); - } else { - AddrLo = DAG.getBuildVector(VectorVT, DL, - {Op.getOperand(i), Op.getOperand(i + 1)}); - i++; - } - AddrLo = DAG.getBitcast(MVT::f32, AddrLo); + Pack = Op.getOperand(i); + if (Pack.getValueType() != MVT::i16) + Pack = DAG.getBitcast(MVT::i16, Pack); + Pack = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Pack); } - VAddrs.push_back(AddrLo); + Pack = DAG.getBitcast(MVT::f32, Pack); + VAddrs.push_back(Pack); } } else { - for (unsigned i = 0; i < NumMIVAddrs; ++i) - VAddrs.push_back(Op.getOperand(AddrIdx + i)); + for (unsigned i = CoordIdx; i < CoordsEnd; i++) + VAddrs.push_back(Op.getOperand(i)); + } + + // Illegal to use g16 images without 16 bit addresses on target that do not + // support it. 
+ if (IsG16) { + if (!ST->hasFeature(AMDGPU::FeatureG16)) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); + return Op; + } + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 } // If the register allocator cannot place the address registers contiguously diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,6 +63,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenInstrInfo.inc" @@ -3754,7 +3756,7 @@ IsA16 = A16->getImm() != 0; } - bool PackDerivatives = IsA16; // Either A16 or G16 + bool PackDerivatives = IsA16 || BaseOpcode->G16; bool IsNSA = SRsrcIdx - VAddr0Idx > 1; unsigned AddrWords = BaseOpcode->NumExtraArgs; @@ -3791,6 +3793,8 @@ } if (VAddrWords != AddrWords) { + LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords + << " but got " << VAddrWords << "\n"); ErrInfo = "bad vaddr size"; return false; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -211,6 +211,7 @@ uint8_t NumExtraArgs; bool Gradients; + bool G16; bool Coordinates; bool LodOrClampOrMip; bool HasD16; @@ -247,11 +248,19 @@ MIMGBaseOpcode NONMIP; }; +struct MIMGG16MappingInfo { + MIMGBaseOpcode G; + MIMGBaseOpcode G16; +}; + LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); LLVM_READONLY -const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L); +const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); + +LLVM_READONLY +const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G); LLVM_READONLY int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -552,6 +561,7 @@ bool hasSRAMECC(const MCSubtargetInfo &STI); bool hasMIMG_R128(const MCSubtargetInfo &STI); bool hasGFX10A16(const MCSubtargetInfo &STI); +bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -108,6 +108,7 @@ #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL #define GET_MIMGMIPMappingTable_IMPL +#define GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -934,6 +935,10 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16]; } +bool hasG16(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureG16]; +} + bool hasPackedD16(const MCSubtargetInfo &STI) { return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s + 
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 ; encoding: [0x09,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 ; encoding: [0x09,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x15,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x03,0x05,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; 
encoding: [0x01,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, 
half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: 
[0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, 
half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V1: +; GFX10: ; %bb.0: ; %main_body +; 
GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V2: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> 
@sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half 
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
+; GFX10-LABEL: sample_c_cd_cl_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
+; GFX10-LABEL: sample_c_cd_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
+; GFX10-LABEL: sample_c_d_o_2darray_V1:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret float %v
+}
+
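+; V1 and V2 differ only in how many channels the dmask requests: 0x4 selects
+; a single component (result in v0), while 0x6 below selects two (result in
+; v[0:1]). The gradient packing is identical in both.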
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
+; GFX10-LABEL: sample_c_d_o_2darray_V2:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
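+; The declarations below use the overload suffix <return>.<gradient>.<coord>;
+; every G16 variant takes f16 gradients while coordinates and clamp stay f32.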
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
@@ -0,0 +1,17 @@
+; RUN: not --crash llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+; Make sure this doesn't assert on targets without the g16 feature, and instead
+; generates a selection error.
+
+; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.sample.d.1d
+
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+attributes #0 = { nounwind readonly }