Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -450,6 +450,91 @@ BUFFER_LOAD_FORMAT_D16, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, + IMAGE_LOAD, + IMAGE_LOAD_MIP, + IMAGE_STORE, + IMAGE_STORE_MIP, + + // Basic sample. + IMAGE_SAMPLE, + IMAGE_SAMPLE_CL, + IMAGE_SAMPLE_D, + IMAGE_SAMPLE_D_CL, + IMAGE_SAMPLE_L, + IMAGE_SAMPLE_B, + IMAGE_SAMPLE_B_CL, + IMAGE_SAMPLE_LZ, + IMAGE_SAMPLE_CD, + IMAGE_SAMPLE_CD_CL, + + // Sample with comparison. + IMAGE_SAMPLE_C, + IMAGE_SAMPLE_C_CL, + IMAGE_SAMPLE_C_D, + IMAGE_SAMPLE_C_D_CL, + IMAGE_SAMPLE_C_L, + IMAGE_SAMPLE_C_B, + IMAGE_SAMPLE_C_B_CL, + IMAGE_SAMPLE_C_LZ, + IMAGE_SAMPLE_C_CD, + IMAGE_SAMPLE_C_CD_CL, + + // Sample with offsets. + IMAGE_SAMPLE_O, + IMAGE_SAMPLE_CL_O, + IMAGE_SAMPLE_D_O, + IMAGE_SAMPLE_D_CL_O, + IMAGE_SAMPLE_L_O, + IMAGE_SAMPLE_B_O, + IMAGE_SAMPLE_B_CL_O, + IMAGE_SAMPLE_LZ_O, + IMAGE_SAMPLE_CD_O, + IMAGE_SAMPLE_CD_CL_O, + + // Sample with comparison and offsets. + IMAGE_SAMPLE_C_O, + IMAGE_SAMPLE_C_CL_O, + IMAGE_SAMPLE_C_D_O, + IMAGE_SAMPLE_C_D_CL_O, + IMAGE_SAMPLE_C_L_O, + IMAGE_SAMPLE_C_B_O, + IMAGE_SAMPLE_C_B_CL_O, + IMAGE_SAMPLE_C_LZ_O, + IMAGE_SAMPLE_C_CD_O, + IMAGE_SAMPLE_C_CD_CL_O, + + // Basic gather4. + IMAGE_GATHER4, + IMAGE_GATHER4_CL, + IMAGE_GATHER4_L, + IMAGE_GATHER4_B, + IMAGE_GATHER4_B_CL, + IMAGE_GATHER4_LZ, + + // Gather4 with comparison. + IMAGE_GATHER4_C, + IMAGE_GATHER4_C_CL, + IMAGE_GATHER4_C_L, + IMAGE_GATHER4_C_B, + IMAGE_GATHER4_C_B_CL, + IMAGE_GATHER4_C_LZ, + + // Gather4 with offsets. + IMAGE_GATHER4_O, + IMAGE_GATHER4_CL_O, + IMAGE_GATHER4_L_O, + IMAGE_GATHER4_B_O, + IMAGE_GATHER4_B_CL_O, + IMAGE_GATHER4_LZ_O, + + // Gather4 with comparison and offsets. 
+ IMAGE_GATHER4_C_O, + IMAGE_GATHER4_C_CL_O, + IMAGE_GATHER4_C_L_O, + IMAGE_GATHER4_C_B_O, + IMAGE_GATHER4_C_B_CL_O, + IMAGE_GATHER4_C_LZ_O, + LAST_AMDGPU_ISD_NUMBER }; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3965,6 +3965,83 @@ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) + NODE_NAME_CASE(IMAGE_LOAD) + NODE_NAME_CASE(IMAGE_LOAD_MIP) + NODE_NAME_CASE(IMAGE_STORE) + NODE_NAME_CASE(IMAGE_STORE_MIP) + // Basic sample. + NODE_NAME_CASE(IMAGE_SAMPLE) + NODE_NAME_CASE(IMAGE_SAMPLE_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_D) + NODE_NAME_CASE(IMAGE_SAMPLE_D_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_L) + NODE_NAME_CASE(IMAGE_SAMPLE_B) + NODE_NAME_CASE(IMAGE_SAMPLE_B_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_LZ) + NODE_NAME_CASE(IMAGE_SAMPLE_CD) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL) + // Sample with comparison. + NODE_NAME_CASE(IMAGE_SAMPLE_C) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_L) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL) + // Sample with offsets. + NODE_NAME_CASE(IMAGE_SAMPLE_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_D_O) + NODE_NAME_CASE(IMAGE_SAMPLE_D_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_L_O) + NODE_NAME_CASE(IMAGE_SAMPLE_B_O) + NODE_NAME_CASE(IMAGE_SAMPLE_B_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_LZ_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL_O) + // Sample with comparison and offsets. 
+ NODE_NAME_CASE(IMAGE_SAMPLE_C_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_L_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL_O) + // Basic gather4. + NODE_NAME_CASE(IMAGE_GATHER4) + NODE_NAME_CASE(IMAGE_GATHER4_CL) + NODE_NAME_CASE(IMAGE_GATHER4_L) + NODE_NAME_CASE(IMAGE_GATHER4_B) + NODE_NAME_CASE(IMAGE_GATHER4_B_CL) + NODE_NAME_CASE(IMAGE_GATHER4_LZ) + // Gather4 with comparison. + NODE_NAME_CASE(IMAGE_GATHER4_C) + NODE_NAME_CASE(IMAGE_GATHER4_C_CL) + NODE_NAME_CASE(IMAGE_GATHER4_C_L) + NODE_NAME_CASE(IMAGE_GATHER4_C_B) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL) + NODE_NAME_CASE(IMAGE_GATHER4_C_LZ) + // Gather4 with offsets. + NODE_NAME_CASE(IMAGE_GATHER4_O) + NODE_NAME_CASE(IMAGE_GATHER4_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_L_O) + NODE_NAME_CASE(IMAGE_GATHER4_B_O) + NODE_NAME_CASE(IMAGE_GATHER4_B_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_LZ_O) + // Gather4 with comparison and offsets. 
+ NODE_NAME_CASE(IMAGE_GATHER4_C_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_L_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_LZ_O) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,7 @@ ImmTyDA, ImmTyR128, ImmTyLWE, + ImmTyD16, ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, @@ -283,6 +284,7 @@ bool isDA() const { return isImmTy(ImmTyDA); } bool isR128() const { return isImmTy(ImmTyUNorm); } bool isLWE() const { return isImmTy(ImmTyLWE); } + bool isD16() const { return isImmTy(ImmTyD16); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } bool isExpVM() const { return isImmTy(ImmTyExpVM); } @@ -664,6 +666,7 @@ case ImmTyDA: OS << "DA"; break; case ImmTyR128: OS << "R128"; break; case ImmTyLWE: OS << "LWE"; break; + case ImmTyD16: OS << "D16"; break; case ImmTyOff: OS << "Off"; break; case ImmTyExpTgt: OS << "ExpTgt"; break; case ImmTyExpCompr: OS << "ExpCompr"; break; @@ -1065,6 +1068,7 @@ AMDGPUOperand::Ptr defaultDA() const; AMDGPUOperand::Ptr defaultR128() const; AMDGPUOperand::Ptr defaultLWE() const; + AMDGPUOperand::Ptr defaultD16() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; @@ -3996,6 +4000,7 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); addOptionalImmOperand(Inst, Operands, OptionalIdx, 
AMDGPUOperand::ImmTySLC); } @@ -4023,6 +4028,10 @@ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultD16() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyD16); +} + //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// @@ -4122,6 +4131,7 @@ {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, + {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -84,6 +84,8 @@ raw_ostream &O); void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -217,6 +217,11 @@ printNamedBit(MI, OpNo, O, "lwe"); } +void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "d16"); +} + void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, 
raw_ostream &O) { Index: lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- lib/Target/AMDGPU/MIMGInstructions.td +++ lib/Target/AMDGPU/MIMGInstructions.td @@ -32,26 +32,54 @@ class MIMG_NoSampler_Helper op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, + bit d16_bit=0, string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), (ins addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), dns>, MIMGe { let ssamp = 0; + let D16 = d16; } multiclass MIMG_NoSampler_Src_Helper op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V2 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V4 : MIMG_NoSampler_Helper , - MIMG_Mask; + let d16 = 0 in { + def _V1 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V2 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V4 : MIMG_NoSampler_Helper , + MIMG_Mask; + } // End d16 = 0. + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + def _V1_D16 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V2_D16 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V4_D16 : MIMG_NoSampler_Helper , + MIMG_Mask; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem in { + let DecoderNamespace = "GFX80_UNPACKED" in { + def _V1_D16_gfx80 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V2_D16_gfx80 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V4_D16_gfx80 : MIMG_NoSampler_Helper , + MIMG_Mask; + } // End GFX80_UNPACKED. + } // End HasUnpackedD16VMem. + } // End d16 = 1. 
} multiclass MIMG_NoSampler op, string asm> { @@ -63,12 +91,13 @@ class MIMG_Store_Helper op, string asm, RegisterClass data_rc, - RegisterClass addr_rc> : MIMG_Helper < + RegisterClass addr_rc, + bit d16_bit=0> : MIMG_Helper < (outs), (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", "") >, MIMGe { let ssamp = 0; let mayLoad = 1; // TableGen requires this for matching with the intrinsics @@ -76,17 +105,42 @@ let hasSideEffects = 1; let hasPostISelHook = 0; let DisableWQM = 1; + let D16 = d16; } multiclass MIMG_Store_Addr_Helper op, string asm, RegisterClass data_rc, int channels> { - def _V1 : MIMG_Store_Helper , - MIMG_Mask; - def _V2 : MIMG_Store_Helper , - MIMG_Mask; - def _V4 : MIMG_Store_Helper , - MIMG_Mask; + let d16 = 0 in { + def _V1 : MIMG_Store_Helper , + MIMG_Mask; + def _V2 : MIMG_Store_Helper , + MIMG_Mask; + def _V4 : MIMG_Store_Helper , + MIMG_Mask; + } // End d16 = 0. + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + def _V1_D16 : MIMG_Store_Helper , + MIMG_Mask; + def _V2_D16 : MIMG_Store_Helper , + MIMG_Mask; + def _V4_D16 : MIMG_Store_Helper , + MIMG_Mask; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem in { + let DecoderNamespace = "GFX80_UNPACKED" in { + def _V1_D16_gfx80 : MIMG_Store_Helper , + MIMG_Mask; + def _V2_D16_gfx80 : MIMG_Store_Helper , + MIMG_Mask; + def _V4_D16_gfx80 : MIMG_Store_Helper , + MIMG_Mask; + } // End GFX80_UNPACKED. + } // End HasPackedD16VMem. + } // End d16 = 1. 
} multiclass MIMG_Store op, string asm> { @@ -121,6 +175,7 @@ let AssemblerPredicates = [isSICI]; let DecoderNamespace = "SICI"; let DisableDecoder = DisableSIDecoder; + let d16 = 0; } class MIMG_Atomic_Real_vi : MIMG_Helper < (outs dst_rc:$vdata), (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), dns>, MIMGe { let WQM = wqm; + let D16 = d16; } multiclass MIMG_Sampler_Src_Helper op, string asm, RegisterClass dst_rc, int channels, bit wqm> { - def _V1 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V2 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V4 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V8 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V16 : MIMG_Sampler_Helper , - MIMG_Mask; + let d16 = 0 in { + def _V1 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V2 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V4 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V8 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V16 : MIMG_Sampler_Helper , + MIMG_Mask; + } // End d16 = 0. + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + def _V1_D16 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V2_D16 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V4_D16 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V8_D16 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V16_D16 : MIMG_Sampler_Helper , + MIMG_Mask; + } // End HasPackedD16VMem. + + let SubtargetPredicate = HasUnpackedD16VMem in { + let DecoderNamespace = "GFX80_UNPACKED" in { + def _V1_D16_gfx80 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V2_D16_gfx80 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V4_D16_gfx80 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V8_D16_gfx80 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V16_D16_gfx80 : MIMG_Sampler_Helper , + MIMG_Mask; + } // End GFX80_UNPACKED. 
+ } // End HasUnpackedD16VMem. + } // End d16 = 1. } multiclass MIMG_Sampler op, string asm, bit wqm=0> { @@ -195,12 +287,13 @@ class MIMG_Gather_Helper op, string asm, RegisterClass dst_rc, - RegisterClass src_rc, bit wqm> : MIMG < + RegisterClass src_rc, + bit wqm, bit d16_bit=0> : MIMG < (outs dst_rc:$vdata), (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"#!if(d16_bit, " d16", ""), []>, MIMGe { let mayLoad = 1; let mayStore = 0; @@ -215,6 +308,7 @@ let Gather4 = 1; let hasPostISelHook = 0; let WQM = wqm; + let D16 = d16; let isAsmParserOnly = 1; // TBD: fix it later } @@ -222,16 +316,48 @@ multiclass MIMG_Gather_Src_Helper op, string asm, RegisterClass dst_rc, int channels, bit wqm> { - def _V1 : MIMG_Gather_Helper , - MIMG_Mask; - def _V2 : MIMG_Gather_Helper , - MIMG_Mask; - def _V4 : MIMG_Gather_Helper , - MIMG_Mask; - def _V8 : MIMG_Gather_Helper , - MIMG_Mask; - def _V16 : MIMG_Gather_Helper , - MIMG_Mask; + let d16 = 0 in { + def _V1 : MIMG_Gather_Helper , + MIMG_Mask; + def _V2 : MIMG_Gather_Helper , + MIMG_Mask; + def _V4 : MIMG_Gather_Helper , + MIMG_Mask; + def _V8 : MIMG_Gather_Helper , + MIMG_Mask; + def _V16 : MIMG_Gather_Helper , + MIMG_Mask; + } // End d16 = 0. + + let d16 = 1 in { + let SubtargetPredicate = HasPackedD16VMem in { + def _V1_D16 : MIMG_Gather_Helper , + MIMG_Mask; + def _V2_D16 : MIMG_Gather_Helper , + MIMG_Mask; + def _V4_D16 : MIMG_Gather_Helper , + MIMG_Mask; + def _V8_D16 : MIMG_Gather_Helper , + MIMG_Mask; + def _V16_D16 : MIMG_Gather_Helper , + MIMG_Mask; + } // End HasPackedD16VMem. 
+ + let SubtargetPredicate = HasUnpackedD16VMem in { + let DecoderNamespace = "GFX80_UNPACKED" in { + def _V1_D16_gfx80 : MIMG_Gather_Helper , + MIMG_Mask; + def _V2_D16_gfx80 : MIMG_Gather_Helper , + MIMG_Mask; + def _V4_D16_gfx80 : MIMG_Gather_Helper , + MIMG_Mask; + def _V8_D16_gfx80 : MIMG_Gather_Helper , + MIMG_Mask; + def _V16_D16_gfx80 : MIMG_Gather_Helper , + MIMG_Mask; + } // End HasPackedD16VMem. + } // End HasUnpackedD16VMem. + } // End d16 = 1. } multiclass MIMG_Gather op, string asm, bit wqm=0> { @@ -365,12 +491,11 @@ def : SampleRawPattern(opcode # _V4_V16), v16i32>; } -// Image + sampler for amdgcn +// ImageSample for amdgcn // TODO: -// 1. Handle half data type like v4f16, and add D16 bit support; -// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). -// 3. Add A16 support when we pass address of half type. -multiclass AMDGCNSamplePattern { +// 1. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). +// 2. Add A16 support when we pass address of half type. +multiclass ImageSamplePattern { def : GCNPat< (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), @@ -380,36 +505,44 @@ >; } -multiclass AMDGCNSampleDataPatterns { - defm : AMDGCNSamplePattern(opcode # _V1), dt, f32>; - defm : AMDGCNSamplePattern(opcode # _V2), dt, v2f32>; - defm : AMDGCNSamplePattern(opcode # _V4), dt, v4f32>; - defm : AMDGCNSamplePattern(opcode # _V8), dt, v8f32>; - defm : AMDGCNSamplePattern(opcode # _V16), dt, v16f32>; +multiclass ImageSampleDataPatterns { + defm : ImageSamplePattern(opcode # _V1 # suffix), dt, f32>; + defm : ImageSamplePattern(opcode # _V2 # suffix), dt, v2f32>; + defm : ImageSamplePattern(opcode # _V4 # suffix), dt, v4f32>; + defm : ImageSamplePattern(opcode # _V8 # suffix), dt, v8f32>; + defm : ImageSamplePattern(opcode # _V16 # suffix), dt, v16f32>; } -// TODO: support v3f32. 
-multiclass AMDGCNSamplePatterns { - defm : AMDGCNSampleDataPatterns(opcode # _V1), f32>; - defm : AMDGCNSampleDataPatterns(opcode # _V2), v2f32>; - defm : AMDGCNSampleDataPatterns(opcode # _V4), v4f32>; +// ImageSample patterns. +multiclass ImageSamplePatterns { + defm : ImageSampleDataPatterns(opcode # _V1), f32>; + defm : ImageSampleDataPatterns(opcode # _V2), v2f32>; + defm : ImageSampleDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">; + defm : ImageSampleDataPatterns(opcode # _V1), v2f16, "_D16">; + } // End HasPackedD16VMem. } -// Image only -class ImagePattern : GCNPat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, - imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), - (opcode $addr, $rsrc, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; +// ImageSample altervative patterns for illegal vector half Types. +multiclass ImageSampleAltPatterns { + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; + defm : ImageSampleDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; + } // End HasUnpackedD16VMem. -multiclass ImagePatterns { - def : ImagePattern(opcode # _V4_V1), i32>; - def : ImagePattern(opcode # _V4_V2), v2i32>; - def : ImagePattern(opcode # _V4_V4), v4i32>; + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">; + } // End HasPackedD16VMem. 
} +// ImageLoad for amdgcn multiclass ImageLoadPattern { def : GCNPat < (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, @@ -420,19 +553,43 @@ >; } -multiclass ImageLoadDataPatterns { - defm : ImageLoadPattern(opcode # _V1), dt, i32>; - defm : ImageLoadPattern(opcode # _V2), dt, v2i32>; - defm : ImageLoadPattern(opcode # _V4), dt, v4i32>; +multiclass ImageLoadDataPatterns { + defm : ImageLoadPattern(opcode # _V1 # suffix), dt, i32>; + defm : ImageLoadPattern(opcode # _V2 # suffix ), dt, v2i32>; + defm : ImageLoadPattern(opcode # _V4 # suffix), dt, v4i32>; } +// ImageLoad patterns. // TODO: support v3f32. multiclass ImageLoadPatterns { defm : ImageLoadDataPatterns(opcode # _V1), f32>; defm : ImageLoadDataPatterns(opcode # _V2), v2f32>; defm : ImageLoadDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">; + defm : ImageLoadDataPatterns(opcode # _V1), v2f16, "_D16">; + } // End HasPackedD16VMem. } +// ImageLoad alternative patterns for illegal vector half Types. +multiclass ImageLoadAltPatterns { + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; + defm : ImageLoadDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; + } // End HasUnPackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16">; + } // End HasPackedD16VMem. 
+} + +// ImageStore for amdgcn multiclass ImageStorePattern { def : GCNPat < (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, @@ -443,30 +600,56 @@ >; } -multiclass ImageStoreDataPatterns { - defm : ImageStorePattern(opcode # _V1), dt, i32>; - defm : ImageStorePattern(opcode # _V2), dt, v2i32>; - defm : ImageStorePattern(opcode # _V4), dt, v4i32>; +multiclass ImageStoreDataPatterns { + defm : ImageStorePattern(opcode # _V1 # suffix), dt, i32>; + defm : ImageStorePattern(opcode # _V2 # suffix), dt, v2i32>; + defm : ImageStorePattern(opcode # _V4 # suffix), dt, v4i32>; } +// ImageStore patterns. // TODO: support v3f32. multiclass ImageStorePatterns { defm : ImageStoreDataPatterns(opcode # _V1), f32>; defm : ImageStoreDataPatterns(opcode # _V2), v2f32>; defm : ImageStoreDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), f16, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), f16, "_D16">; + defm : ImageStoreDataPatterns(opcode # _V1), v2f16, "_D16">; + } // End HasPackedD16VMem. } +// ImageStore alternative patterns. +multiclass ImageStoreAltPatterns { + let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; + defm : ImageStoreDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; + } // End HasUnpackedD16VMem. + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageStoreDataPatterns(opcode # _V2), v2i32, "_D16">; + } // End HasPackedD16VMem. +} + +// ImageAtomic for amdgcn. class ImageAtomicPattern : GCNPat < (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; +// ImageAtomic patterns. 
multiclass ImageAtomicPatterns { def : ImageAtomicPattern(opcode # _V1), i32>; def : ImageAtomicPattern(opcode # _V2), v2i32>; def : ImageAtomicPattern(opcode # _V4), v4i32>; } +// ImageAtomicCmpSwap for amdgcn. class ImageAtomicCmpSwapPattern : GCNPat < (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), @@ -478,93 +661,180 @@ // ======= amdgcn Image Intrinsics ============== -// Image load +// Image load. defm : ImageLoadPatterns; defm : ImageLoadPatterns; defm : ImageLoadPatterns; - -// Image store -defm : ImageStorePatterns; -defm : ImageStorePatterns; - -// Basic sample -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with comparison -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with offsets -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with comparison and offsets -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Gather opcodes -defm : AMDGCNSamplePatterns; -defm : 
AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; +defm : ImageLoadAltPatterns; +defm : ImageLoadAltPatterns; + +// Image store. +defm : ImageStorePatterns; +defm : ImageStorePatterns; +defm : ImageStoreAltPatterns; +defm : ImageStoreAltPatterns; + +// Basic sample. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with comparison. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with comparison and offsets. 
+defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Basic gather4. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with comparison. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with comparison and offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Basic sample alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with comparison alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with offsets alternative. 
+defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with comparison and offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Basic gather4 alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with comparison alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with comparison and offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +defm : ImageSamplePatterns; // Image atomics defm : ImageAtomicPatterns; Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -82,7 +82,10 @@ // Clamps hi component of register. // ClampLo and ClampHi set for packed clamp. 
- ClampHi = UINT64_C(1) << 48 + ClampHi = UINT64_C(1) << 48, + + // "d16" bit set or not. + D16 = UINT64_C(1) << 49 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -546,6 +546,98 @@ VT == MVT::v4f16); } +static unsigned getImageOpcode (unsigned IID) { + switch (IID) { + case Intrinsic::amdgcn_image_load : return AMDGPUISD::IMAGE_LOAD; + case Intrinsic::amdgcn_image_load_mip : return AMDGPUISD::IMAGE_LOAD_MIP; + + // Basic sample. + case Intrinsic::amdgcn_image_sample: return AMDGPUISD::IMAGE_SAMPLE; + case Intrinsic::amdgcn_image_sample_cl: return AMDGPUISD::IMAGE_SAMPLE_CL; + case Intrinsic::amdgcn_image_sample_d: return AMDGPUISD::IMAGE_SAMPLE_D; + case Intrinsic::amdgcn_image_sample_d_cl: return AMDGPUISD::IMAGE_SAMPLE_D_CL; + case Intrinsic::amdgcn_image_sample_l: return AMDGPUISD::IMAGE_SAMPLE_L; + case Intrinsic::amdgcn_image_sample_b: return AMDGPUISD::IMAGE_SAMPLE_B; + case Intrinsic::amdgcn_image_sample_b_cl: return AMDGPUISD::IMAGE_SAMPLE_B_CL; + case Intrinsic::amdgcn_image_sample_lz: return AMDGPUISD::IMAGE_SAMPLE_LZ; + case Intrinsic::amdgcn_image_sample_cd: return AMDGPUISD::IMAGE_SAMPLE_CD; + case Intrinsic::amdgcn_image_sample_cd_cl: return AMDGPUISD::IMAGE_SAMPLE_CD_CL; + + // Sample with comparison. 
+ case Intrinsic::amdgcn_image_sample_c: return AMDGPUISD::IMAGE_SAMPLE_C; + case Intrinsic::amdgcn_image_sample_c_cl: return AMDGPUISD::IMAGE_SAMPLE_C_CL; + case Intrinsic::amdgcn_image_sample_c_d: return AMDGPUISD::IMAGE_SAMPLE_C_D; + case Intrinsic::amdgcn_image_sample_c_d_cl: return AMDGPUISD::IMAGE_SAMPLE_C_D_CL; + case Intrinsic::amdgcn_image_sample_c_l: return AMDGPUISD::IMAGE_SAMPLE_C_L; + case Intrinsic::amdgcn_image_sample_c_b: return AMDGPUISD::IMAGE_SAMPLE_C_B; + case Intrinsic::amdgcn_image_sample_c_b_cl: return AMDGPUISD::IMAGE_SAMPLE_C_B_CL; + case Intrinsic::amdgcn_image_sample_c_lz: return AMDGPUISD::IMAGE_SAMPLE_C_LZ; + case Intrinsic::amdgcn_image_sample_c_cd: return AMDGPUISD::IMAGE_SAMPLE_C_CD; + case Intrinsic::amdgcn_image_sample_c_cd_cl: return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL; + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: return AMDGPUISD::IMAGE_SAMPLE_O; + case Intrinsic::amdgcn_image_sample_cl_o: return AMDGPUISD::IMAGE_SAMPLE_CL_O; + case Intrinsic::amdgcn_image_sample_d_o: return AMDGPUISD::IMAGE_SAMPLE_D_O; + case Intrinsic::amdgcn_image_sample_d_cl_o: return AMDGPUISD::IMAGE_SAMPLE_D_CL_O; + case Intrinsic::amdgcn_image_sample_l_o: return AMDGPUISD::IMAGE_SAMPLE_L_O; + case Intrinsic::amdgcn_image_sample_b_o: return AMDGPUISD::IMAGE_SAMPLE_B_O; + case Intrinsic::amdgcn_image_sample_b_cl_o: return AMDGPUISD::IMAGE_SAMPLE_B_CL_O; + case Intrinsic::amdgcn_image_sample_lz_o: return AMDGPUISD::IMAGE_SAMPLE_LZ_O; + case Intrinsic::amdgcn_image_sample_cd_o: return AMDGPUISD::IMAGE_SAMPLE_CD_O; + case Intrinsic::amdgcn_image_sample_cd_cl_o: return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O; + + // Sample with comparison and offsets. 
+ case Intrinsic::amdgcn_image_sample_c_o: return AMDGPUISD::IMAGE_SAMPLE_C_O; + case Intrinsic::amdgcn_image_sample_c_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_CL_O; + case Intrinsic::amdgcn_image_sample_c_d_o: return AMDGPUISD::IMAGE_SAMPLE_C_D_O; + case Intrinsic::amdgcn_image_sample_c_d_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O; + case Intrinsic::amdgcn_image_sample_c_l_o: return AMDGPUISD::IMAGE_SAMPLE_C_L_O; + case Intrinsic::amdgcn_image_sample_c_b_o: return AMDGPUISD::IMAGE_SAMPLE_C_B_O; + case Intrinsic::amdgcn_image_sample_c_b_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O; + case Intrinsic::amdgcn_image_sample_c_lz_o: return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O; + case Intrinsic::amdgcn_image_sample_c_cd_o: return AMDGPUISD::IMAGE_SAMPLE_C_CD_O; + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O; + + // Basic gather4. + case Intrinsic::amdgcn_image_gather4: return AMDGPUISD::IMAGE_GATHER4; + case Intrinsic::amdgcn_image_gather4_cl: return AMDGPUISD::IMAGE_GATHER4_CL; + case Intrinsic::amdgcn_image_gather4_l: return AMDGPUISD::IMAGE_GATHER4_L; + case Intrinsic::amdgcn_image_gather4_b: return AMDGPUISD::IMAGE_GATHER4_B; + case Intrinsic::amdgcn_image_gather4_b_cl: return AMDGPUISD::IMAGE_GATHER4_B_CL; + case Intrinsic::amdgcn_image_gather4_lz: return AMDGPUISD::IMAGE_GATHER4_LZ; + + // Gather4 with comparison. + case Intrinsic::amdgcn_image_gather4_c: return AMDGPUISD::IMAGE_GATHER4_C; + case Intrinsic::amdgcn_image_gather4_c_cl: return AMDGPUISD::IMAGE_GATHER4_C_CL; + case Intrinsic::amdgcn_image_gather4_c_l: return AMDGPUISD::IMAGE_GATHER4_C_L; + case Intrinsic::amdgcn_image_gather4_c_b: return AMDGPUISD::IMAGE_GATHER4_C_B; + case Intrinsic::amdgcn_image_gather4_c_b_cl: return AMDGPUISD::IMAGE_GATHER4_C_B_CL; + case Intrinsic::amdgcn_image_gather4_c_lz: return AMDGPUISD::IMAGE_GATHER4_C_LZ; + + // Gather4 with offsets. 
+ case Intrinsic::amdgcn_image_gather4_o: return AMDGPUISD::IMAGE_GATHER4_O; + case Intrinsic::amdgcn_image_gather4_cl_o: return AMDGPUISD::IMAGE_GATHER4_CL_O; + case Intrinsic::amdgcn_image_gather4_l_o: return AMDGPUISD::IMAGE_GATHER4_L_O; + case Intrinsic::amdgcn_image_gather4_b_o: return AMDGPUISD::IMAGE_GATHER4_B_O; + case Intrinsic::amdgcn_image_gather4_b_cl_o: return AMDGPUISD::IMAGE_GATHER4_B_CL_O; + case Intrinsic::amdgcn_image_gather4_lz_o: return AMDGPUISD::IMAGE_GATHER4_LZ_O; + + // Gather4 with comparison and offsets. + case Intrinsic::amdgcn_image_gather4_c_o: return AMDGPUISD::IMAGE_GATHER4_C_O; + case Intrinsic::amdgcn_image_gather4_c_cl_o: return AMDGPUISD::IMAGE_GATHER4_C_CL_O; + case Intrinsic::amdgcn_image_gather4_c_l_o: return AMDGPUISD::IMAGE_GATHER4_C_L_O; + case Intrinsic::amdgcn_image_gather4_c_b_o: return AMDGPUISD::IMAGE_GATHER4_C_B_O; + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O; + case Intrinsic::amdgcn_image_gather4_c_lz_o: return AMDGPUISD::IMAGE_GATHER4_C_LZ_O; + + default: break; + } + return 0; +} + + + bool SITargetLowering::isShuffleMaskLegal(ArrayRef, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -569,6 +661,88 @@ Info.writeMem = true; return true; } + // Basic sample. + case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. 
+ case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4. + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison. 
+ case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets. + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets. + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: + + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_tbuffer_load: { @@ -583,7 +757,9 @@ return true; } case Intrinsic::amdgcn_buffer_store_format: - case Intrinsic::amdgcn_tbuffer_store: { + case Intrinsic::amdgcn_tbuffer_store: + case Intrinsic::amdgcn_image_store: + case Intrinsic::amdgcn_image_store_mip: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); Info.ptrVal = nullptr; @@ -3284,7 +3460,7 @@ M->getMemOperand()); } case Intrinsic::amdgcn_buffer_load_format: { - SDValue Ops[] = { + SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc Op.getOperand(3), // vindex @@ -3296,6 +3472,117 @@ DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vaddr + Op.getOperand(3), // rsrc + Op.getOperand(4), // dmask + Op.getOperand(5), // glc + Op.getOperand(6), // slc + 
Op.getOperand(7), // lwe + Op.getOperand(8) // da + }; + unsigned Opc = getImageOpcode (IID); + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + } + // Basic sample. + case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. 
+ case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4 + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison + case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vaddr + Op.getOperand(3), // rsrc + Op.getOperand(4), // sampler + Op.getOperand(5), // dmask + Op.getOperand(6), // unorm + Op.getOperand(7), // glc + Op.getOperand(8), // slc + Op.getOperand(9), // lwe + Op.getOperand(10) // da + }; + unsigned Opc = getImageOpcode (IID); 
+ return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: return SDValue(); } // End switch. @@ -4611,6 +4898,50 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + + case Intrinsic::amdgcn_image_store: + case Intrinsic::amdgcn_image_store_mip: { + // TODO: factor out the common part from buffer_store, + // tbuffer_store and image_store. + SDValue VData = Op.getOperand(2); + EVT StoreVT = VData.getValueType(); + if (isHalfVT(StoreVT)) { + // TODO: Handle v3f16. + if (StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) { + if (!Subtarget->hasUnpackedD16VMem()) { + if (!isTypeLegal(StoreVT)) { + // If Target supports packed vmem, we just need to workaround + // the illegal type by casting to an equivalent one. + EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), + StoreVT); + VData = DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + } + } else { // We need to unpack the packed data to store. + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + VData = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + } + } + } + SDValue Ops[] = { + Chain, // Chain + VData, // vdata + Op.getOperand(3), // vaddr + Op.getOperand(4), // rsrc + Op.getOperand(5), // dmask + Op.getOperand(6), // glc + Op.getOperand(7), // slc + Op.getOperand(8), // lwe + Op.getOperand(9) // da + }; + + unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ? + AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } default: return Op; } @@ -6568,6 +6899,7 @@ SDNode *Users[4] = { }; unsigned Lane = 0; unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ?
2 : 3; + unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; @@ -6708,7 +7040,7 @@ unsigned Opcode = Node->getMachineOpcode(); if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && - !TII->isGather4(Opcode)) + !TII->isGather4(Opcode) && !TII->isD16(Opcode)) adjustWritemask(Node, DAG); if (Opcode == AMDGPU::INSERT_SUBREG || @@ -6788,13 +7120,17 @@ } if (TII->isMIMG(MI)) { + // Don't adjust the writemask for D16 instructions; note that not every MIMG instruction has a d16 bit. + if (TII->isD16(MI)) + return; + unsigned VReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(VReg); // TODO: Need mapping tables to handle other cases (register classes). if (RC != &AMDGPU::VReg_128RegClass) return; - unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; + unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -115,6 +115,8 @@ // of a packed output register. field bit ClampHi = 0; + field bit D16 = 0; + // These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -167,6 +169,7 @@ let TSFlags{46} = IntClamp; let TSFlags{47} = ClampLo; let TSFlags{48} = ClampHi; + let TSFlags{49} = D16; let SchedRW = [Write32Bit]; @@ -242,6 +245,7 @@ bits<1> tfe; bits<1> lwe; bits<1> slc; + bits<1> d16; bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; @@ -260,6 +264,7 @@ let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; + let Inst{63} = d16; } class EXPe : Enc64 { Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -436,6 +436,14 @@ return get(Opcode).TSFlags & SIInstrFlags::Gather4; } + static bool isD16(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::D16; + } + + bool isD16(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::D16; + } + static bool isFLAT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::FLAT; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -118,6 +118,135 @@ SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SDTImage_load : SDTypeProfile<1, 7, + [ + SDTCisInt<1>, // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, i32>, // dmask + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>, // slc + SDTCisVT<6, i1>, // lwe + SDTCisVT<7, i1> // da + ]>; +def SIImage_load : SDNode<"AMDGPUISD::IMAGE_LOAD", SDTImage_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SIImage_load_mip : SDNode<"AMDGPUISD::IMAGE_LOAD_MIP", SDTImage_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; + +def SDTImage_store : SDTypeProfile<0, 8, + [ + SDTCisInt<1>, // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, i32>, // dmask + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>, // slc + SDTCisVT<6, i1>, // lwe + SDTCisVT<7, i1> // da + ]>; +def 
SIImage_store : SDNode <"AMDGPUISD::IMAGE_STORE", + SDTImage_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIImage_store_mip : SDNode <"AMDGPUISD::IMAGE_STORE_MIP", + SDTImage_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + +class SDTImage_sample : SDNode , // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, v4i32>, // sampler + SDTCisVT<4, i32>, // dmask + SDTCisVT<5, i1>, // unorm + SDTCisVT<6, i1>, // glc + SDTCisVT<7, i1>, // slc + SDTCisVT<8, i1>, // lwe + SDTCisVT<9, i1> // da + ]>, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +// Basic sample. +def SIImage_sample : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE">; +def SIImage_sample_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CL">; +def SIImage_sample_d : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D">; +def SIImage_sample_d_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_CL">; +def SIImage_sample_l : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_L">; +def SIImage_sample_b : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B">; +def SIImage_sample_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_CL">; +def SIImage_sample_lz : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_LZ">; +def SIImage_sample_cd : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD">; +def SIImage_sample_cd_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison. 
+def SIImage_sample_c : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C">; +def SIImage_sample_c_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CL">; +def SIImage_sample_c_d : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D">; +def SIImage_sample_c_d_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_CL">; +def SIImage_sample_c_l : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_L">; +def SIImage_sample_c_b : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B">; +def SIImage_sample_c_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_CL">; +def SIImage_sample_c_lz : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_LZ">; +def SIImage_sample_c_cd : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD">; +def SIImage_sample_c_cd_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets. +def SIImage_sample_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_O">; +def SIImage_sample_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CL_O">; +def SIImage_sample_d_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_O">; +def SIImage_sample_d_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_CL_O">; +def SIImage_sample_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_L_O">; +def SIImage_sample_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_O">; +def SIImage_sample_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_CL_O">; +def SIImage_sample_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_LZ_O">; +def SIImage_sample_cd_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_O">; +def SIImage_sample_cd_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets. 
+def SIImage_sample_c_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_O">; +def SIImage_sample_c_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CL_O">; +def SIImage_sample_c_d_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_O">; +def SIImage_sample_c_d_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O">; +def SIImage_sample_c_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_L_O">; +def SIImage_sample_c_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_O">; +def SIImage_sample_c_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O">; +def SIImage_sample_c_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_LZ_O">; +def SIImage_sample_c_cd_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_O">; +def SIImage_sample_c_cd_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O">; + +// Basic gather4. +def SIImage_gather4 : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4">; +def SIImage_gather4_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_CL">; +def SIImage_gather4_l : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_L">; +def SIImage_gather4_b : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B">; +def SIImage_gather4_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_CL">; +def SIImage_gather4_lz : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_LZ">; + +// Gather4 with comparison. +def SIImage_gather4_c : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C">; +def SIImage_gather4_c_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_CL">; +def SIImage_gather4_c_l : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_L">; +def SIImage_gather4_c_b : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B">; +def SIImage_gather4_c_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_CL">; +def SIImage_gather4_c_lz : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_LZ">; + +// Gather4 with offsets. 
+def SIImage_gather4_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_O">; +def SIImage_gather4_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_CL_O">; +def SIImage_gather4_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_L_O">; +def SIImage_gather4_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_O">; +def SIImage_gather4_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_CL_O">; +def SIImage_gather4_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_LZ_O">; + +// Gather4 with comparison and offsets. +def SIImage_gather4_c_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_O">; +def SIImage_gather4_c_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_CL_O">; +def SIImage_gather4_c_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_L_O">; +def SIImage_gather4_c_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_O">; +def SIImage_gather4_c_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_CL_O">; +def SIImage_gather4_c_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_LZ_O">; + + class SDSample : SDNode , SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> @@ -618,6 +747,7 @@ def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; +def d16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -0,0 +1,132 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs 
| FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + +;v{{\[}}[[0-9]+]:[[HI:[0-9]+]]{{\]}} +;v[[HI:[0-9]+]] +; v{{\[}}[[LO]]:[[HI]]{{\]}} + +; GCN-LABEL: {{^}}image_load_f16 +; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 +define amdgpu_ps half @image_load_f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + %tex = call half @llvm.amdgcn.image.load.f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) + ret half %tex +} + +; GCN-LABEL: {{^}}image_load_v2f16: +; UNPACKED: image_load v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: image_load v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @image_load_v2f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + %tex = call <2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false) + %elt = extractelement <2 x half> %tex, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}image_load_v4f16: +; UNPACKED: image_load v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: image_load v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @image_load_v4f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.load.v4f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + %elt = extractelement <4 x half> %tex, i32 3 + ret half %elt +} + +; GCN-LABEL: {{^}}image_load_mip_v4f16: +; UNPACKED: 
image_load_mip v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: image_load_mip v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @image_load_mip_v4f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.load.mip.v4f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + %elt = extractelement <4 x half> %tex, i32 3 + ret half %elt +} + +; GCN-LABEL: {{^}}image_store_f16 +; GCN: v_trunc_f16_e32 v[[LO:[0-9]+]], s{{[0-9]+}} +; GCN: image_store v[[LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 +define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) + ret void +} + +; GCN-LABEL: {{^}}image_store_v2f16 + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 + +; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 +define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + call void @llvm.amdgcn.image.store.v2f16.v4i32.v8i32(<2 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false) + ret void +} + +; GCN-LABEL: {{^}}image_store_v4f16 + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, 
v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 + +; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] +; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] + +; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + ret void +} + +; GCN-LABEL: {{^}}image_store_mip_v4f16 + +; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc slc +; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 + +; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] +; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] + +; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { +main_body: + call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + ret void +} + + +declare half @llvm.amdgcn.image.load.f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) +declare 
<2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.load.v4f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.load.mip.v4f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) + +declare void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) +declare void @llvm.amdgcn.image.store.v2f16.v4i32.v8i32(<2 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) +declare void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) +declare void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) + + + Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll @@ -0,0 +1,137 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}image_gather4_f16: +; GCN: image_gather4 v[[HALF:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 d16 + +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_gather4_f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call half @llvm.amdgcn.image.gather4.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> 
%sample, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) + store half %tex, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_gather4_v2f16: +; UNPACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_gather4 v[[DATA:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16 + +; GFX81: v_lshrrev_b32_e32 v[[HI:[0-9]+]], 16, v[[DATA]] +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[DATA]], off +define amdgpu_kernel void @image_gather4_v2f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <2 x half> @llvm.amdgcn.image.gather4.v2f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <2 x half> %tex, i32 1 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_gather4_v4f16: +; UNPACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = 
extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_gather4_cl_v4f16: +; UNPACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_gather4_c_v4f16: +; UNPACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, 
i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_gather4_o_v4f16: +; UNPACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_gather4_c_o_v4f16: +; UNPACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> 
%rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +declare half @llvm.amdgcn.image.gather4.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <2 x half> @llvm.amdgcn.image.gather4.v2f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) + + +declare <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}image_sample_f16: +; GCN: image_sample v[[HALF:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 d16 + +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short 
v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_sample_f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) + store half %tex, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_sample_v2f16: +; UNPACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_sample v[[DATA:[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 d16 + +; GFX81: v_lshrrev_b32_e32 v[[HI:[0-9]+]], 16, v[[DATA]] +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[DATA]], off +define amdgpu_kernel void @image_sample_v2f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <2 x half> @llvm.amdgcn.image.sample.v2f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <2 x half> %tex, i32 1 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_sample_v4f16: +; UNPACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void 
@image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_sample_cl_v4f16: +; UNPACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_sample_c_v4f16: +; UNPACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] + +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define 
amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_sample_o_v4f16: +; UNPACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off +define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}image_sample_c_o_v4f16: +; UNPACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]] + +; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16 +; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]] + +; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]] +; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], 
v[[HALF]], off +define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) { +main_body: + %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x half> %tex, i32 3 + store half %elt, half addrspace(1)* %out + ret void +} + +declare half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <2 x half> @llvm.amdgcn.image.sample.v2f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) + + +declare <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)