Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -474,6 +474,91 @@ BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, BUFFER_ATOMIC_CMPSWAP, + IMAGE_LOAD, + IMAGE_LOAD_MIP, + IMAGE_STORE, + IMAGE_STORE_MIP, + + // Basic sample. + IMAGE_SAMPLE, + IMAGE_SAMPLE_CL, + IMAGE_SAMPLE_D, + IMAGE_SAMPLE_D_CL, + IMAGE_SAMPLE_L, + IMAGE_SAMPLE_B, + IMAGE_SAMPLE_B_CL, + IMAGE_SAMPLE_LZ, + IMAGE_SAMPLE_CD, + IMAGE_SAMPLE_CD_CL, + + // Sample with comparison. + IMAGE_SAMPLE_C, + IMAGE_SAMPLE_C_CL, + IMAGE_SAMPLE_C_D, + IMAGE_SAMPLE_C_D_CL, + IMAGE_SAMPLE_C_L, + IMAGE_SAMPLE_C_B, + IMAGE_SAMPLE_C_B_CL, + IMAGE_SAMPLE_C_LZ, + IMAGE_SAMPLE_C_CD, + IMAGE_SAMPLE_C_CD_CL, + + // Sample with offsets. + IMAGE_SAMPLE_O, + IMAGE_SAMPLE_CL_O, + IMAGE_SAMPLE_D_O, + IMAGE_SAMPLE_D_CL_O, + IMAGE_SAMPLE_L_O, + IMAGE_SAMPLE_B_O, + IMAGE_SAMPLE_B_CL_O, + IMAGE_SAMPLE_LZ_O, + IMAGE_SAMPLE_CD_O, + IMAGE_SAMPLE_CD_CL_O, + + // Sample with comparison and offsets. + IMAGE_SAMPLE_C_O, + IMAGE_SAMPLE_C_CL_O, + IMAGE_SAMPLE_C_D_O, + IMAGE_SAMPLE_C_D_CL_O, + IMAGE_SAMPLE_C_L_O, + IMAGE_SAMPLE_C_B_O, + IMAGE_SAMPLE_C_B_CL_O, + IMAGE_SAMPLE_C_LZ_O, + IMAGE_SAMPLE_C_CD_O, + IMAGE_SAMPLE_C_CD_CL_O, + + // Basic gather4. + IMAGE_GATHER4, + IMAGE_GATHER4_CL, + IMAGE_GATHER4_L, + IMAGE_GATHER4_B, + IMAGE_GATHER4_B_CL, + IMAGE_GATHER4_LZ, + + // Gather4 with comparison. + IMAGE_GATHER4_C, + IMAGE_GATHER4_C_CL, + IMAGE_GATHER4_C_L, + IMAGE_GATHER4_C_B, + IMAGE_GATHER4_C_B_CL, + IMAGE_GATHER4_C_LZ, + + // Gather4 with offsets. + IMAGE_GATHER4_O, + IMAGE_GATHER4_CL_O, + IMAGE_GATHER4_L_O, + IMAGE_GATHER4_B_O, + IMAGE_GATHER4_B_CL_O, + IMAGE_GATHER4_LZ_O, + + // Gather4 with comparison and offsets. + IMAGE_GATHER4_C_O, + IMAGE_GATHER4_C_CL_O, + IMAGE_GATHER4_C_L_O, + IMAGE_GATHER4_C_B_O, + IMAGE_GATHER4_C_B_CL_O, + IMAGE_GATHER4_C_LZ_O, + LAST_AMDGPU_ISD_NUMBER }; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3999,6 +3999,83 @@ NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + NODE_NAME_CASE(IMAGE_LOAD) + NODE_NAME_CASE(IMAGE_LOAD_MIP) + NODE_NAME_CASE(IMAGE_STORE) + NODE_NAME_CASE(IMAGE_STORE_MIP) + // Basic sample. + NODE_NAME_CASE(IMAGE_SAMPLE) + NODE_NAME_CASE(IMAGE_SAMPLE_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_D) + NODE_NAME_CASE(IMAGE_SAMPLE_D_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_L) + NODE_NAME_CASE(IMAGE_SAMPLE_B) + NODE_NAME_CASE(IMAGE_SAMPLE_B_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_LZ) + NODE_NAME_CASE(IMAGE_SAMPLE_CD) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL) + // Sample with comparison. + NODE_NAME_CASE(IMAGE_SAMPLE_C) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_L) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL) + // Sample with offsets. 
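  // Editorial note, not part of the patch: NODE_NAME_CASE is assumed to be the
  // helper macro already used by getTargetNodeName() earlier in this file,
  // roughly
  //   #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
  // so each IMAGE_* enumerator added above also gets a printable node name.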
+ NODE_NAME_CASE(IMAGE_SAMPLE_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_D_O) + NODE_NAME_CASE(IMAGE_SAMPLE_D_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_L_O) + NODE_NAME_CASE(IMAGE_SAMPLE_B_O) + NODE_NAME_CASE(IMAGE_SAMPLE_B_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_LZ_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL_O) + // Sample with comparison and offsets. + NODE_NAME_CASE(IMAGE_SAMPLE_C_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_L_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL_O) + // Basic gather4. + NODE_NAME_CASE(IMAGE_GATHER4) + NODE_NAME_CASE(IMAGE_GATHER4_CL) + NODE_NAME_CASE(IMAGE_GATHER4_L) + NODE_NAME_CASE(IMAGE_GATHER4_B) + NODE_NAME_CASE(IMAGE_GATHER4_B_CL) + NODE_NAME_CASE(IMAGE_GATHER4_LZ) + // Gather4 with comparison. + NODE_NAME_CASE(IMAGE_GATHER4_C) + NODE_NAME_CASE(IMAGE_GATHER4_C_CL) + NODE_NAME_CASE(IMAGE_GATHER4_C_L) + NODE_NAME_CASE(IMAGE_GATHER4_C_B) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL) + NODE_NAME_CASE(IMAGE_GATHER4_C_LZ) + // Gather4 with offsets. + NODE_NAME_CASE(IMAGE_GATHER4_O) + NODE_NAME_CASE(IMAGE_GATHER4_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_L_O) + NODE_NAME_CASE(IMAGE_GATHER4_B_O) + NODE_NAME_CASE(IMAGE_GATHER4_B_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_LZ_O) + // Gather4 with comparison and offsets. + NODE_NAME_CASE(IMAGE_GATHER4_C_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_L_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_LZ_O) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; Index: lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- lib/Target/AMDGPU/MIMGInstructions.td +++ lib/Target/AMDGPU/MIMGInstructions.td @@ -32,26 +32,45 @@ class MIMG_NoSampler_Helper op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, + bit d16_bit=0, string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), (ins addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), dns>, MIMGe { let ssamp = 0; + let D16 = d16; +} + +multiclass MIMG_NoSampler_Src_Helper_Helper op, string asm, + RegisterClass dst_rc, + int channels, bit d16_bit, + string suffix> { + def _V1 # suffix : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V2 # suffix : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V4 # suffix : MIMG_NoSampler_Helper , + MIMG_Mask; } multiclass MIMG_NoSampler_Src_Helper op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V2 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V4 : MIMG_NoSampler_Helper , - MIMG_Mask; + defm : MIMG_NoSampler_Src_Helper_Helper ; + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + defm : MIMG_NoSampler_Src_Helper_Helper ; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem, DecoderNamespace = "GFX80_UNPACKED" in { + defm : MIMG_NoSampler_Src_Helper_Helper ; + } // End HasUnpackedD16VMem. + } // End d16 = 1. 
} multiclass MIMG_NoSampler op, string asm> { @@ -64,30 +83,49 @@ class MIMG_Store_Helper op, string asm, RegisterClass data_rc, RegisterClass addr_rc, + bit d16_bit=0, string dns = ""> : MIMG_Helper < (outs), (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe { + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), dns>, MIMGe { let ssamp = 0; let mayLoad = 0; let mayStore = 1; let hasSideEffects = 0; let hasPostISelHook = 0; let DisableWQM = 1; + let D16 = d16; +} + +multiclass MIMG_Store_Addr_Helper_Helper op, string asm, + RegisterClass data_rc, + int channels, bit d16_bit, + string suffix> { + def _V1 # suffix : MIMG_Store_Helper , + MIMG_Mask; + def _V2 # suffix : MIMG_Store_Helper , + MIMG_Mask; + def _V4 # suffix : MIMG_Store_Helper , + MIMG_Mask; } multiclass MIMG_Store_Addr_Helper op, string asm, RegisterClass data_rc, int channels> { - def _V1 : MIMG_Store_Helper , - MIMG_Mask; - def _V2 : MIMG_Store_Helper , - MIMG_Mask; - def _V4 : MIMG_Store_Helper , - MIMG_Mask; + defm : MIMG_Store_Addr_Helper_Helper ; + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + defm : MIMG_Store_Addr_Helper_Helper ; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem, DecoderNamespace = "GFX80_UNPACKED" in { + defm : MIMG_Store_Addr_Helper_Helper ; + } // End HasUnpackedD16VMem. + } // End d16 = 1. } multiclass MIMG_Store op, string asm> { @@ -159,30 +197,49 @@ RegisterClass dst_rc, RegisterClass src_rc, bit wqm, + bit d16_bit=0, string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), dns>, MIMGe { let WQM = wqm; + let D16 = d16; +} + +multiclass MIMG_Sampler_Src_Helper_Helper op, string asm, + RegisterClass dst_rc, + int channels, bit wqm, + bit d16_bit, string suffix> { + def _V1 # suffix : MIMG_Sampler_Helper , + MIMG_Mask; + def _V2 # suffix : MIMG_Sampler_Helper , + MIMG_Mask; + def _V4 # suffix : MIMG_Sampler_Helper , + MIMG_Mask; + def _V8 # suffix : MIMG_Sampler_Helper , + MIMG_Mask; + def _V16 # suffix : MIMG_Sampler_Helper , + MIMG_Mask; } multiclass MIMG_Sampler_Src_Helper op, string asm, RegisterClass dst_rc, int channels, bit wqm> { - def _V1 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V2 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V4 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V8 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V16 : MIMG_Sampler_Helper , - MIMG_Mask; + defm : MIMG_Sampler_Src_Helper_Helper ; + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + defm : MIMG_Sampler_Src_Helper_Helper ; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem, DecoderNamespace = "GFX80_UNPACKED" in { + defm : MIMG_Sampler_Src_Helper_Helper ; + } // End HasUnpackedD16VMem. + } // End d16 = 1. 
} multiclass MIMG_Sampler op, string asm, bit wqm=0> { @@ -196,12 +253,12 @@ class MIMG_Gather_Helper op, string asm, RegisterClass dst_rc, - RegisterClass src_rc, bit wqm> : MIMG < + RegisterClass src_rc, bit wqm, bit d16_bit=0> : MIMG < (outs dst_rc:$vdata), (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), []>, MIMGe { let mayLoad = 1; let mayStore = 0; @@ -216,23 +273,42 @@ let Gather4 = 1; let hasPostISelHook = 0; let WQM = wqm; + let D16 = d16; let isAsmParserOnly = 1; // TBD: fix it later } + +multiclass MIMG_Gather_Src_Helper_Helper op, string asm, + RegisterClass dst_rc, + int channels, bit wqm, + bit d16_bit, string suffix> { + def _V1 # suffix : MIMG_Gather_Helper , + MIMG_Mask; + def _V2 # suffix : MIMG_Gather_Helper , + MIMG_Mask; + def _V4 # suffix : MIMG_Gather_Helper , + MIMG_Mask; + def _V8 # suffix : MIMG_Gather_Helper , + MIMG_Mask; + def _V16 # suffix : MIMG_Gather_Helper , + MIMG_Mask; +} + multiclass MIMG_Gather_Src_Helper op, string asm, RegisterClass dst_rc, int channels, bit wqm> { - def _V1 : MIMG_Gather_Helper , - MIMG_Mask; - def _V2 : MIMG_Gather_Helper , - MIMG_Mask; - def _V4 : MIMG_Gather_Helper , - MIMG_Mask; - def _V8 : MIMG_Gather_Helper , - MIMG_Mask; - def _V16 : MIMG_Gather_Helper , - MIMG_Mask; + defm : MIMG_Gather_Src_Helper_Helper; + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + defm : MIMG_Gather_Src_Helper_Helper; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem, DecoderNamespace = "GFX80_UNPACKED" in { + defm : MIMG_Gather_Src_Helper_Helper; + } // End HasUnpackedD16VMem. + } // End d16 = 1. } multiclass MIMG_Gather op, string asm, bit wqm=0> { @@ -357,29 +433,11 @@ /********** Image sampling patterns **********/ /********** ======================= **********/ -// Image + sampler -class SampleRawPattern : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode $addr, $rsrc, $sampler, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass SampleRawPatterns { - def : SampleRawPattern(opcode # _V4_V1), i32>; - def : SampleRawPattern(opcode # _V4_V2), v2i32>; - def : SampleRawPattern(opcode # _V4_V4), v4i32>; - def : SampleRawPattern(opcode # _V4_V8), v8i32>; - def : SampleRawPattern(opcode # _V4_V16), v16i32>; -} - -// Image + sampler for amdgcn +// ImageSample for amdgcn // TODO: -// 1. Handle half data type like v4f16, and add D16 bit support; -// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). -// 3. Add A16 support when we pass address of half type. -multiclass AMDGCNSamplePattern { +// 1. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). +// 2. Add A16 support when we pass address of half type. 
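// Editorial sketch, not part of the patch: the multiclasses above produce D16
// variants whose names follow <MNEMONIC>_V<data dwords>_V<addr dwords><suffix>,
// with suffix "_D16" under HasPackedD16VMem and "_D16_gfx80" under
// HasUnpackedD16VMem; the printed assembly gains a trailing " d16" and the
// encoding sets Inst{63}. Assuming that scheme, an f16 sample with a two-dword
// address would resolve to something like:
//   IMAGE_SAMPLE_V1_V2_D16        // gfx810/gfx9 (packed D16 VMEM)
//   IMAGE_SAMPLE_V1_V2_D16_gfx80  // gfx80x unpacked encoding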
+multiclass ImageSamplePattern { def : GCNPat< (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), @@ -389,36 +447,44 @@ >; } -multiclass AMDGCNSampleDataPatterns { - defm : AMDGCNSamplePattern(opcode # _V1), dt, f32>; - defm : AMDGCNSamplePattern(opcode # _V2), dt, v2f32>; - defm : AMDGCNSamplePattern(opcode # _V4), dt, v4f32>; - defm : AMDGCNSamplePattern(opcode # _V8), dt, v8f32>; - defm : AMDGCNSamplePattern(opcode # _V16), dt, v16f32>; +multiclass ImageSampleDataPatterns { + defm : ImageSamplePattern(opcode # _V1 # suffix), dt, f32>; + defm : ImageSamplePattern(opcode # _V2 # suffix), dt, v2f32>; + defm : ImageSamplePattern(opcode # _V4 # suffix), dt, v4f32>; + defm : ImageSamplePattern(opcode # _V8 # suffix), dt, v8f32>; + defm : ImageSamplePattern(opcode # _V16 # suffix), dt, v16f32>; } -// TODO: support v3f32. -multiclass AMDGCNSamplePatterns { - defm : AMDGCNSampleDataPatterns(opcode # _V1), f32>; - defm : AMDGCNSampleDataPatterns(opcode # _V2), v2f32>; - defm : AMDGCNSampleDataPatterns(opcode # _V4), v4f32>; +// ImageSample patterns. +multiclass ImageSamplePatterns { + defm : ImageSampleDataPatterns(opcode # _V1), f32>; + defm : ImageSampleDataPatterns(opcode # _V2), v2f32>; + defm : ImageSampleDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">; + defm : ImageSampleDataPatterns(opcode # _V1), v2f16, "_D16">; + } // End HasPackedD16VMem. } -// Image only -class ImagePattern : GCNPat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, - imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), - (opcode $addr, $rsrc, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; +// ImageSample alternative patterns for illegal vector half Types. +multiclass ImageSampleAltPatterns { + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; + defm : ImageSampleDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; + } // End HasUnpackedD16VMem. -multiclass ImagePatterns { - def : ImagePattern(opcode # _V4_V1), i32>; - def : ImagePattern(opcode # _V4_V2), v2i32>; - def : ImagePattern(opcode # _V4_V4), v4i32>; + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">; + } // End HasPackedD16VMem. } +// ImageLoad for amdgcn. multiclass ImageLoadPattern { def : GCNPat < (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, @@ -429,19 +495,43 @@ >; } -multiclass ImageLoadDataPatterns { - defm : ImageLoadPattern(opcode # _V1), dt, i32>; - defm : ImageLoadPattern(opcode # _V2), dt, v2i32>; - defm : ImageLoadPattern(opcode # _V4), dt, v4i32>; +multiclass ImageLoadDataPatterns { + defm : ImageLoadPattern(opcode # _V1 # suffix), dt, i32>; + defm : ImageLoadPattern(opcode # _V2 # suffix), dt, v2i32>; + defm : ImageLoadPattern(opcode # _V4 # suffix), dt, v4i32>; } +// ImageLoad patterns. // TODO: support v3f32. 
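// Editorial sketch, not part of the patch: the "Patterns" multiclasses below
// cover result types that stay legal as half types (f16 everywhere, v2f16 on
// packed-D16 targets) and select the suffixed D16 instructions directly; for
// example, an f16 load with a single-dword address on an unpacked-D16 target
// would resolve to roughly
//   ImageLoadPattern<SIImage_load, IMAGE_LOAD_V1_V1_D16_gfx80, f16, i32>
// Half-vector results that are not legal are instead widened to i32 vectors by
// the lowering in SIISelLowering.cpp, matched by the "*AltPatterns"
// multiclasses, and converted back via adjustLoadValueType().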
multiclass ImageLoadPatterns { defm : ImageLoadDataPatterns(opcode # _V1), f32>; defm : ImageLoadDataPatterns(opcode # _V2), v2f32>; defm : ImageLoadDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">; + defm : ImageLoadDataPatterns(opcode # _V1), v2f16, "_D16">; + } // End HasPackedD16VMem. } +// ImageLoad alternative patterns for illegal vector half Types. +multiclass ImageLoadAltPatterns { + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; + defm : ImageLoadDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; + } // End HasUnPackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16">; + } // End HasPackedD16VMem. +} + +// ImageStore for amdgcn. multiclass ImageStorePattern { def : GCNPat < (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, @@ -452,30 +542,56 @@ >; } -multiclass ImageStoreDataPatterns { - defm : ImageStorePattern(opcode # _V1), dt, i32>; - defm : ImageStorePattern(opcode # _V2), dt, v2i32>; - defm : ImageStorePattern(opcode # _V4), dt, v4i32>; +multiclass ImageStoreDataPatterns { + defm : ImageStorePattern(opcode # _V1 # suffix), dt, i32>; + defm : ImageStorePattern(opcode # _V2 # suffix), dt, v2i32>; + defm : ImageStorePattern(opcode # _V4 # suffix), dt, v4i32>; } +// ImageStore patterns. // TODO: support v3f32. multiclass ImageStorePatterns { defm : ImageStoreDataPatterns(opcode # _V1), f32>; defm : ImageStoreDataPatterns(opcode # _V2), v2f32>; defm : ImageStoreDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), f16, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), f16, "_D16">; + defm : ImageStoreDataPatterns(opcode # _V1), v2f16, "_D16">; + } // End HasPackedD16VMem. } +// ImageStore alternative patterns. +multiclass ImageStoreAltPatterns { + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; + defm : ImageStoreDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageStoreDataPatterns(opcode # _V2), v2i32, "_D16">; + } // End HasPackedD16VMem. +} + +// ImageAtomic for amdgcn. class ImageAtomicPattern : GCNPat < (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; +// ImageAtomic patterns. multiclass ImageAtomicPatterns { def : ImageAtomicPattern(opcode # _V1), i32>; def : ImageAtomicPattern(opcode # _V2), v2i32>; def : ImageAtomicPattern(opcode # _V4), v4i32>; } +// ImageAtomicCmpSwap for amdgcn. class ImageAtomicCmpSwapPattern : GCNPat < (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), @@ -487,93 +603,180 @@ // ======= amdgcn Image Intrinsics ============== -// Image load +// Image load. 
defm : ImageLoadPatterns; defm : ImageLoadPatterns; defm : ImageLoadPatterns; - -// Image store -defm : ImageStorePatterns; -defm : ImageStorePatterns; - -// Basic sample -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with comparison -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with offsets -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with comparison and offsets -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Gather opcodes -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; +defm : ImageLoadAltPatterns; +defm : ImageLoadAltPatterns; + +// Image store. +defm : ImageStorePatterns; +defm : ImageStorePatterns; +defm : ImageStoreAltPatterns; +defm : ImageStoreAltPatterns; + +// Basic sample. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with comparison. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with comparison and offsets. 
+defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Basic gather4. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with comparison. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with comparison and offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Basic sample alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with comparison alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with comparison and offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Basic gather4 alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with comparison alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with comparison and offsets alternative. 
+defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +defm : ImageSamplePatterns; // Image atomics defm : ImageAtomicPatterns; Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -85,7 +85,10 @@ ClampHi = UINT64_C(1) << 48, // Is a packed VOP3P instruction. - IsPacked = UINT64_C(1) << 49 + IsPacked = UINT64_C(1) << 49, + + // "d16" bit set or not. + D16 = UINT64_C(1) << 50 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3504,6 +3504,163 @@ return SDValue(); } +static unsigned getImageOpcode(unsigned IID) { + switch (IID) { + case Intrinsic::amdgcn_image_load: + return AMDGPUISD::IMAGE_LOAD; + case Intrinsic::amdgcn_image_load_mip: + return AMDGPUISD::IMAGE_LOAD_MIP; + + // Basic sample. + case Intrinsic::amdgcn_image_sample: + return AMDGPUISD::IMAGE_SAMPLE; + case Intrinsic::amdgcn_image_sample_cl: + return AMDGPUISD::IMAGE_SAMPLE_CL; + case Intrinsic::amdgcn_image_sample_d: + return AMDGPUISD::IMAGE_SAMPLE_D; + case Intrinsic::amdgcn_image_sample_d_cl: + return AMDGPUISD::IMAGE_SAMPLE_D_CL; + case Intrinsic::amdgcn_image_sample_l: + return AMDGPUISD::IMAGE_SAMPLE_L; + case Intrinsic::amdgcn_image_sample_b: + return AMDGPUISD::IMAGE_SAMPLE_B; + case Intrinsic::amdgcn_image_sample_b_cl: + return AMDGPUISD::IMAGE_SAMPLE_B_CL; + case Intrinsic::amdgcn_image_sample_lz: + return AMDGPUISD::IMAGE_SAMPLE_LZ; + case Intrinsic::amdgcn_image_sample_cd: + return AMDGPUISD::IMAGE_SAMPLE_CD; + case Intrinsic::amdgcn_image_sample_cd_cl: + return AMDGPUISD::IMAGE_SAMPLE_CD_CL; + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + return AMDGPUISD::IMAGE_SAMPLE_C; + case Intrinsic::amdgcn_image_sample_c_cl: + return AMDGPUISD::IMAGE_SAMPLE_C_CL; + case Intrinsic::amdgcn_image_sample_c_d: + return AMDGPUISD::IMAGE_SAMPLE_C_D; + case Intrinsic::amdgcn_image_sample_c_d_cl: + return AMDGPUISD::IMAGE_SAMPLE_C_D_CL; + case Intrinsic::amdgcn_image_sample_c_l: + return AMDGPUISD::IMAGE_SAMPLE_C_L; + case Intrinsic::amdgcn_image_sample_c_b: + return AMDGPUISD::IMAGE_SAMPLE_C_B; + case Intrinsic::amdgcn_image_sample_c_b_cl: + return AMDGPUISD::IMAGE_SAMPLE_C_B_CL; + case Intrinsic::amdgcn_image_sample_c_lz: + return AMDGPUISD::IMAGE_SAMPLE_C_LZ; + case Intrinsic::amdgcn_image_sample_c_cd: + return AMDGPUISD::IMAGE_SAMPLE_C_CD; + case Intrinsic::amdgcn_image_sample_c_cd_cl: + return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL; + + // Sample with offsets. 
+ case Intrinsic::amdgcn_image_sample_o: + return AMDGPUISD::IMAGE_SAMPLE_O; + case Intrinsic::amdgcn_image_sample_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_CL_O; + case Intrinsic::amdgcn_image_sample_d_o: + return AMDGPUISD::IMAGE_SAMPLE_D_O; + case Intrinsic::amdgcn_image_sample_d_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_D_CL_O; + case Intrinsic::amdgcn_image_sample_l_o: + return AMDGPUISD::IMAGE_SAMPLE_L_O; + case Intrinsic::amdgcn_image_sample_b_o: + return AMDGPUISD::IMAGE_SAMPLE_B_O; + case Intrinsic::amdgcn_image_sample_b_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_B_CL_O; + case Intrinsic::amdgcn_image_sample_lz_o: + return AMDGPUISD::IMAGE_SAMPLE_LZ_O; + case Intrinsic::amdgcn_image_sample_cd_o: + return AMDGPUISD::IMAGE_SAMPLE_CD_O; + case Intrinsic::amdgcn_image_sample_cd_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O; + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + return AMDGPUISD::IMAGE_SAMPLE_C_O; + case Intrinsic::amdgcn_image_sample_c_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_C_CL_O; + case Intrinsic::amdgcn_image_sample_c_d_o: + return AMDGPUISD::IMAGE_SAMPLE_C_D_O; + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O; + case Intrinsic::amdgcn_image_sample_c_l_o: + return AMDGPUISD::IMAGE_SAMPLE_C_L_O; + case Intrinsic::amdgcn_image_sample_c_b_o: + return AMDGPUISD::IMAGE_SAMPLE_C_B_O; + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O; + case Intrinsic::amdgcn_image_sample_c_lz_o: + return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O; + case Intrinsic::amdgcn_image_sample_c_cd_o: + return AMDGPUISD::IMAGE_SAMPLE_C_CD_O; + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O; + + // Basic gather4. + case Intrinsic::amdgcn_image_gather4: + return AMDGPUISD::IMAGE_GATHER4; + case Intrinsic::amdgcn_image_gather4_cl: + return AMDGPUISD::IMAGE_GATHER4_CL; + case Intrinsic::amdgcn_image_gather4_l: + return AMDGPUISD::IMAGE_GATHER4_L; + case Intrinsic::amdgcn_image_gather4_b: + return AMDGPUISD::IMAGE_GATHER4_B; + case Intrinsic::amdgcn_image_gather4_b_cl: + return AMDGPUISD::IMAGE_GATHER4_B_CL; + case Intrinsic::amdgcn_image_gather4_lz: + return AMDGPUISD::IMAGE_GATHER4_LZ; + + // Gather4 with comparison. + case Intrinsic::amdgcn_image_gather4_c: + return AMDGPUISD::IMAGE_GATHER4_C; + case Intrinsic::amdgcn_image_gather4_c_cl: + return AMDGPUISD::IMAGE_GATHER4_C_CL; + case Intrinsic::amdgcn_image_gather4_c_l: + return AMDGPUISD::IMAGE_GATHER4_C_L; + case Intrinsic::amdgcn_image_gather4_c_b: + return AMDGPUISD::IMAGE_GATHER4_C_B; + case Intrinsic::amdgcn_image_gather4_c_b_cl: + return AMDGPUISD::IMAGE_GATHER4_C_B_CL; + case Intrinsic::amdgcn_image_gather4_c_lz: + return AMDGPUISD::IMAGE_GATHER4_C_LZ; + + // Gather4 with offsets. + case Intrinsic::amdgcn_image_gather4_o: + return AMDGPUISD::IMAGE_GATHER4_O; + case Intrinsic::amdgcn_image_gather4_cl_o: + return AMDGPUISD::IMAGE_GATHER4_CL_O; + case Intrinsic::amdgcn_image_gather4_l_o: + return AMDGPUISD::IMAGE_GATHER4_L_O; + case Intrinsic::amdgcn_image_gather4_b_o: + return AMDGPUISD::IMAGE_GATHER4_B_O; + case Intrinsic::amdgcn_image_gather4_b_cl_o: + return AMDGPUISD::IMAGE_GATHER4_B_CL_O; + case Intrinsic::amdgcn_image_gather4_lz_o: + return AMDGPUISD::IMAGE_GATHER4_LZ_O; + + // Gather4 with comparison and offsets. 
+ case Intrinsic::amdgcn_image_gather4_c_o: + return AMDGPUISD::IMAGE_GATHER4_C_O; + case Intrinsic::amdgcn_image_gather4_c_cl_o: + return AMDGPUISD::IMAGE_GATHER4_C_CL_O; + case Intrinsic::amdgcn_image_gather4_c_l_o: + return AMDGPUISD::IMAGE_GATHER4_C_L_O; + case Intrinsic::amdgcn_image_gather4_c_b_o: + return AMDGPUISD::IMAGE_GATHER4_C_B_O; + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O; + case Intrinsic::amdgcn_image_gather4_c_lz_o: + return AMDGPUISD::IMAGE_GATHER4_C_LZ_O; + + default: + break; + } + return 0; +} + static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL, SelectionDAG &DAG, bool Unpacked) { if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. @@ -3539,16 +3696,16 @@ switch (IID) { case Intrinsic::amdgcn_tbuffer_load: { SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - Op.getOperand(5), // soffset - Op.getOperand(6), // offset - Op.getOperand(7), // dfmt - Op.getOperand(8), // nfmt - Op.getOperand(9), // glc - Op.getOperand(10) // slc + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc }; Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL, VTList, Ops, M->getMemoryVT(), @@ -3557,19 +3714,134 @@ return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); } case Intrinsic::amdgcn_buffer_load_format: { - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // offset - Op.getOperand(5), // glc - Op.getOperand(6) // slc - }; - Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - DL, VTList, Ops, M->getMemoryVT(), - M->getMemOperand()); - Chain = Res.getValue(1); - return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // offset + Op.getOperand(5), // glc + Op.getOperand(6) // slc + }; + Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, + DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + Chain = Res.getValue(1); + return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); + } + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vaddr + Op.getOperand(3), // rsrc + Op.getOperand(4), // dmask + Op.getOperand(5), // glc + Op.getOperand(6), // slc + Op.getOperand(7), // lwe + Op.getOperand(8) // da + }; + unsigned Opc = getImageOpcode(IID); + Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + Chain = Res.getValue(1); + return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); + } + // Basic sample. + case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. 
+ case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4 + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison + case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vaddr + Op.getOperand(3), // rsrc + Op.getOperand(4), // sampler + Op.getOperand(5), // dmask + Op.getOperand(6), // unorm + Op.getOperand(7), // glc + Op.getOperand(8), // slc + Op.getOperand(9), // lwe + Op.getOperand(10) // da + }; + unsigned Opc = getImageOpcode(IID); + Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + Chain = Res.getValue(1); + return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); } default: return SDValue(); @@ -4955,6 +5227,30 @@ M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_image_store: + case Intrinsic::amdgcn_image_store_mip: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + SDValue Ops[] = { + Chain, // Chain + VData, // vdata + Op.getOperand(3), // vaddr + 
Op.getOperand(4), // rsrc + Op.getOperand(5), // dmask + Op.getOperand(6), // glc + Op.getOperand(7), // slc + Op.getOperand(8), // lwe + Op.getOperand(9) // da + }; + unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ? + AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: return Op; } @@ -7071,7 +7367,7 @@ unsigned Opcode = Node->getMachineOpcode(); if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && - !TII->isGather4(Opcode)) { + !TII->isGather4(Opcode) && !TII->isD16(Opcode)) { return adjustWritemask(Node, DAG); } Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -118,6 +118,9 @@ // This bit indicates that this is a packed VOP3P instruction field bit IsPacked = 0; + // This bit indicates that this is a D16 instruction. + field bit D16 = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -173,6 +176,8 @@ let TSFlags{49} = IsPacked; + let TSFlags{50} = D16; + let SchedRW = [Write32Bit]; field bits<1> DisableSIDecoder = 0; @@ -247,6 +252,7 @@ bits<1> tfe; bits<1> lwe; bits<1> slc; + bits<1> d16 = 0; bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; @@ -265,6 +271,7 @@ let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; + let Inst{63} = d16; } class EXPe : Enc64 { Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -456,6 +456,14 @@ return get(Opcode).TSFlags & SIInstrFlags::Gather4; } + static bool isD16(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::D16; + } + + bool isD16(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::D16; + } + static bool isFLAT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::FLAT; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -154,6 +154,134 @@ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +def SDTImage_load : SDTypeProfile<1, 7, + [ + SDTCisInt<1>, // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, i32>, // dmask + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>, // slc + SDTCisVT<6, i1>, // lwe + SDTCisVT<7, i1> // da + ]>; +def SIImage_load : SDNode<"AMDGPUISD::IMAGE_LOAD", SDTImage_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SIImage_load_mip : SDNode<"AMDGPUISD::IMAGE_LOAD_MIP", SDTImage_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; + +def SDTImage_store : SDTypeProfile<0, 8, + [ + SDTCisInt<1>, // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, i32>, // dmask + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>, // slc + SDTCisVT<6, i1>, // lwe + SDTCisVT<7, i1> // da + ]>; +def SIImage_store : SDNode <"AMDGPUISD::IMAGE_STORE", + SDTImage_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIImage_store_mip : SDNode <"AMDGPUISD::IMAGE_STORE_MIP", + SDTImage_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + +class SDTImage_sample : SDNode , // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, v4i32>, // sampler + SDTCisVT<4, i32>, // dmask + SDTCisVT<5, i1>, // unorm + SDTCisVT<6, i1>, // glc + SDTCisVT<7, 
i1>, // slc + SDTCisVT<8, i1>, // lwe + SDTCisVT<9, i1> // da + ]>, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +// Basic sample. +def SIImage_sample : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE">; +def SIImage_sample_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CL">; +def SIImage_sample_d : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D">; +def SIImage_sample_d_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_CL">; +def SIImage_sample_l : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_L">; +def SIImage_sample_b : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B">; +def SIImage_sample_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_CL">; +def SIImage_sample_lz : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_LZ">; +def SIImage_sample_cd : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD">; +def SIImage_sample_cd_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison. +def SIImage_sample_c : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C">; +def SIImage_sample_c_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CL">; +def SIImage_sample_c_d : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D">; +def SIImage_sample_c_d_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_CL">; +def SIImage_sample_c_l : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_L">; +def SIImage_sample_c_b : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B">; +def SIImage_sample_c_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_CL">; +def SIImage_sample_c_lz : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_LZ">; +def SIImage_sample_c_cd : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD">; +def SIImage_sample_c_cd_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets. +def SIImage_sample_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_O">; +def SIImage_sample_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CL_O">; +def SIImage_sample_d_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_O">; +def SIImage_sample_d_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_CL_O">; +def SIImage_sample_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_L_O">; +def SIImage_sample_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_O">; +def SIImage_sample_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_CL_O">; +def SIImage_sample_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_LZ_O">; +def SIImage_sample_cd_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_O">; +def SIImage_sample_cd_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets. +def SIImage_sample_c_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_O">; +def SIImage_sample_c_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CL_O">; +def SIImage_sample_c_d_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_O">; +def SIImage_sample_c_d_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O">; +def SIImage_sample_c_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_L_O">; +def SIImage_sample_c_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_O">; +def SIImage_sample_c_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O">; +def SIImage_sample_c_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_LZ_O">; +def SIImage_sample_c_cd_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_O">; +def SIImage_sample_c_cd_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O">; + +// Basic gather4. 
+def SIImage_gather4 : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4">; +def SIImage_gather4_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_CL">; +def SIImage_gather4_l : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_L">; +def SIImage_gather4_b : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B">; +def SIImage_gather4_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_CL">; +def SIImage_gather4_lz : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_LZ">; + +// Gather4 with comparison. +def SIImage_gather4_c : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C">; +def SIImage_gather4_c_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_CL">; +def SIImage_gather4_c_l : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_L">; +def SIImage_gather4_c_b : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B">; +def SIImage_gather4_c_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_CL">; +def SIImage_gather4_c_lz : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_LZ">; + +// Gather4 with offsets. +def SIImage_gather4_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_O">; +def SIImage_gather4_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_CL_O">; +def SIImage_gather4_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_L_O">; +def SIImage_gather4_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_O">; +def SIImage_gather4_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_CL_O">; +def SIImage_gather4_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_LZ_O">; + +// Gather4 with comparison and offsets. +def SIImage_gather4_c_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_O">; +def SIImage_gather4_c_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_CL_O">; +def SIImage_gather4_c_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_L_O">; +def SIImage_gather4_c_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_O">; +def SIImage_gather4_c_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_CL_O">; +def SIImage_gather4_c_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_LZ_O">; + class SDSample : SDNode , SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -0,0 +1,128 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + +; GCN-LABEL: {{^}}image_load_f16 +; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 +define amdgpu_ps half @image_load_f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + %tex = call half @llvm.amdgcn.image.load.f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) + ret half %tex +} + +; GCN-LABEL: {{^}}image_load_v2f16: +; UNPACKED: image_load v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: image_load v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @image_load_v2f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + %tex = call <2 x half> 
@llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
+  %elt = extractelement <2 x half> %tex, i32 1
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}image_load_v4f16:
+; UNPACKED: image_load v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: image_load v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
+define amdgpu_ps half @image_load_v4f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.load.v4f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %elt = extractelement <4 x half> %tex, i32 3
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}image_load_mip_v4f16:
+; UNPACKED: image_load_mip v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: image_load_mip v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
+define amdgpu_ps half @image_load_mip_v4f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.load.mip.v4f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %elt = extractelement <4 x half> %tex, i32 3
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}image_store_f16
+; GCN: v_trunc_f16_e32 v[[LO:[0-9]+]], s{{[0-9]+}}
+; GCN: image_store v[[LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
+define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_v2f16
+
+; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
+
+; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
+define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.v2f16.v4i32.v8i32(<2 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_v4f16
+
+; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+
+; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
+; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
+
+; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
+; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
+
+; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_mip_v4f16
+
+; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc
+; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+
+; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
+; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
+
+; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
+; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
+
+; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
+define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+
+declare half @llvm.amdgcn.image.load.f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare <2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.load.v4f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.load.mip.v4f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+
+declare void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare void @llvm.amdgcn.image.store.v2f16.v4i32.v8i32(<2 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+
+
+
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
+
+
+; GCN-LABEL: {{^}}image_gather4_f16:
+; GCN: image_gather4 v[[HALF:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 d16
+
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_gather4_f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call half @llvm.amdgcn.image.gather4.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
+  store half %tex, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_v2f16:
+; UNPACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_gather4 v[[DATA:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16
+
+; GFX81: v_lshrrev_b32_e32 v[[HI:[0-9]+]], 16, v[[DATA]]
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[DATA]], off
+define amdgpu_kernel void @image_gather4_v2f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <2 x half> @llvm.amdgcn.image.gather4.v2f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <2 x half> %tex, i32 1
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_v4f16:
+; UNPACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_cl_v4f16:
+; UNPACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_c_v4f16:
+; UNPACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_o_v4f16:
+; UNPACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_c_o_v4f16:
+; UNPACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.amdgcn.image.gather4.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <2 x half> @llvm.amdgcn.image.gather4.v2f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+
+
+declare <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
+
+
+; GCN-LABEL: {{^}}image_sample_f16:
+; GCN: image_sample v[[HALF:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 d16
+
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_sample_f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
+  store half %tex, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_v2f16:
+; UNPACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_sample v[[DATA:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16
+
+; GFX81: v_lshrrev_b32_e32 v[[HI:[0-9]+]], 16, v[[DATA]]
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[DATA]], off
+define amdgpu_kernel void @image_sample_v2f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <2 x half> @llvm.amdgcn.image.sample.v2f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <2 x half> %tex, i32 1
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_v4f16:
+; UNPACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_cl_v4f16:
+; UNPACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_c_v4f16:
+; UNPACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_o_v4f16:
+; UNPACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_c_o_v4f16:
+; UNPACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
+
+; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+
+; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
+; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <2 x half> @llvm.amdgcn.image.sample.v2f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+
+
+declare <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
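
Testing note (not part of the diff): one way to exercise just the tests touched by this patch is llvm-lit from a built tree; the build directory name below is an assumption, adjust to your setup:

  ./build/bin/llvm-lit -v \
      test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll \
      test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll \
      test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll

Each RUN line can also be reproduced by hand by piping llc into FileCheck with the prefixes shown above: UNPACKED for tonga, where each 16-bit component occupies its own VGPR, and PACKED together with GFX81 or GFX9 for gfx810/gfx900, where two components share one VGPR (hence the v_lshrrev_b32 by 16 in the packed checks).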