Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -450,6 +450,91 @@ BUFFER_LOAD_FORMAT_D16, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, + IMAGE_LOAD, + IMAGE_LOAD_MIP, + IMAGE_STORE, + IMAGE_STORE_MIP, + + // Basic sample. + IMAGE_SAMPLE, + IMAGE_SAMPLE_CL, + IMAGE_SAMPLE_D, + IMAGE_SAMPLE_D_CL, + IMAGE_SAMPLE_L, + IMAGE_SAMPLE_B, + IMAGE_SAMPLE_B_CL, + IMAGE_SAMPLE_LZ, + IMAGE_SAMPLE_CD, + IMAGE_SAMPLE_CD_CL, + + // Sample with comparison. + IMAGE_SAMPLE_C, + IMAGE_SAMPLE_C_CL, + IMAGE_SAMPLE_C_D, + IMAGE_SAMPLE_C_D_CL, + IMAGE_SAMPLE_C_L, + IMAGE_SAMPLE_C_B, + IMAGE_SAMPLE_C_B_CL, + IMAGE_SAMPLE_C_LZ, + IMAGE_SAMPLE_C_CD, + IMAGE_SAMPLE_C_CD_CL, + + // Sample with offsets. + IMAGE_SAMPLE_O, + IMAGE_SAMPLE_CL_O, + IMAGE_SAMPLE_D_O, + IMAGE_SAMPLE_D_CL_O, + IMAGE_SAMPLE_L_O, + IMAGE_SAMPLE_B_O, + IMAGE_SAMPLE_B_CL_O, + IMAGE_SAMPLE_LZ_O, + IMAGE_SAMPLE_CD_O, + IMAGE_SAMPLE_CD_CL_O, + + // Sample with comparison and offsets. + IMAGE_SAMPLE_C_O, + IMAGE_SAMPLE_C_CL_O, + IMAGE_SAMPLE_C_D_O, + IMAGE_SAMPLE_C_D_CL_O, + IMAGE_SAMPLE_C_L_O, + IMAGE_SAMPLE_C_B_O, + IMAGE_SAMPLE_C_B_CL_O, + IMAGE_SAMPLE_C_LZ_O, + IMAGE_SAMPLE_C_CD_O, + IMAGE_SAMPLE_C_CD_CL_O, + + // Basic gather4. + IMAGE_GATHER4, + IMAGE_GATHER4_CL, + IMAGE_GATHER4_L, + IMAGE_GATHER4_B, + IMAGE_GATHER4_B_CL, + IMAGE_GATHER4_LZ, + + // Gather4 with comparison. + IMAGE_GATHER4_C, + IMAGE_GATHER4_C_CL, + IMAGE_GATHER4_C_L, + IMAGE_GATHER4_C_B, + IMAGE_GATHER4_C_B_CL, + IMAGE_GATHER4_C_LZ, + + // Gather4 with offsets. + IMAGE_GATHER4_O, + IMAGE_GATHER4_CL_O, + IMAGE_GATHER4_L_O, + IMAGE_GATHER4_B_O, + IMAGE_GATHER4_B_CL_O, + IMAGE_GATHER4_LZ_O, + + // Gather4 with comparison and offsets. + IMAGE_GATHER4_C_O, + IMAGE_GATHER4_C_CL_O, + IMAGE_GATHER4_C_L_O, + IMAGE_GATHER4_C_B_O, + IMAGE_GATHER4_C_B_CL_O, + IMAGE_GATHER4_C_LZ_O, + LAST_AMDGPU_ISD_NUMBER }; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3965,6 +3965,83 @@ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) + NODE_NAME_CASE(IMAGE_LOAD) + NODE_NAME_CASE(IMAGE_LOAD_MIP) + NODE_NAME_CASE(IMAGE_STORE) + NODE_NAME_CASE(IMAGE_STORE_MIP) + // Basic sample. + NODE_NAME_CASE(IMAGE_SAMPLE) + NODE_NAME_CASE(IMAGE_SAMPLE_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_D) + NODE_NAME_CASE(IMAGE_SAMPLE_D_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_L) + NODE_NAME_CASE(IMAGE_SAMPLE_B) + NODE_NAME_CASE(IMAGE_SAMPLE_B_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_LZ) + NODE_NAME_CASE(IMAGE_SAMPLE_CD) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL) + // Sample with comparison. + NODE_NAME_CASE(IMAGE_SAMPLE_C) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_L) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL) + NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL) + // Sample with offsets. 
+ NODE_NAME_CASE(IMAGE_SAMPLE_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_D_O) + NODE_NAME_CASE(IMAGE_SAMPLE_D_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_L_O) + NODE_NAME_CASE(IMAGE_SAMPLE_B_O) + NODE_NAME_CASE(IMAGE_SAMPLE_B_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_LZ_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_O) + NODE_NAME_CASE(IMAGE_SAMPLE_CD_CL_O) + // Sample with comparison and offsets. + NODE_NAME_CASE(IMAGE_SAMPLE_C_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_D_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_L_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_B_CL_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_LZ_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_O) + NODE_NAME_CASE(IMAGE_SAMPLE_C_CD_CL_O) + // Basic gather4. + NODE_NAME_CASE(IMAGE_GATHER4) + NODE_NAME_CASE(IMAGE_GATHER4_CL) + NODE_NAME_CASE(IMAGE_GATHER4_L) + NODE_NAME_CASE(IMAGE_GATHER4_B) + NODE_NAME_CASE(IMAGE_GATHER4_B_CL) + NODE_NAME_CASE(IMAGE_GATHER4_LZ) + // Gather4 with comparison. + NODE_NAME_CASE(IMAGE_GATHER4_C) + NODE_NAME_CASE(IMAGE_GATHER4_C_CL) + NODE_NAME_CASE(IMAGE_GATHER4_C_L) + NODE_NAME_CASE(IMAGE_GATHER4_C_B) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL) + NODE_NAME_CASE(IMAGE_GATHER4_C_LZ) + // Gather4 with offsets. + NODE_NAME_CASE(IMAGE_GATHER4_O) + NODE_NAME_CASE(IMAGE_GATHER4_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_L_O) + NODE_NAME_CASE(IMAGE_GATHER4_B_O) + NODE_NAME_CASE(IMAGE_GATHER4_B_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_LZ_O) + // Gather4 with comparison and offsets. + NODE_NAME_CASE(IMAGE_GATHER4_C_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_L_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_B_CL_O) + NODE_NAME_CASE(IMAGE_GATHER4_C_LZ_O) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,7 @@ ImmTyDA, ImmTyR128, ImmTyLWE, + ImmTyD16, ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, @@ -283,6 +284,7 @@ bool isDA() const { return isImmTy(ImmTyDA); } bool isR128() const { return isImmTy(ImmTyUNorm); } bool isLWE() const { return isImmTy(ImmTyLWE); } + bool isD16() const { return isImmTy(ImmTyD16); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } bool isExpVM() const { return isImmTy(ImmTyExpVM); } @@ -664,6 +666,7 @@ case ImmTyDA: OS << "DA"; break; case ImmTyR128: OS << "R128"; break; case ImmTyLWE: OS << "LWE"; break; + case ImmTyD16: OS << "D16"; break; case ImmTyOff: OS << "Off"; break; case ImmTyExpTgt: OS << "ExpTgt"; break; case ImmTyExpCompr: OS << "ExpCompr"; break; @@ -1065,6 +1068,7 @@ AMDGPUOperand::Ptr defaultDA() const; AMDGPUOperand::Ptr defaultR128() const; AMDGPUOperand::Ptr defaultLWE() const; + AMDGPUOperand::Ptr defaultD16() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; @@ -3996,6 +4000,7 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); addOptionalImmOperand(Inst, Operands, OptionalIdx, 
AMDGPUOperand::ImmTySLC); } @@ -4023,6 +4028,10 @@ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultD16() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyD16); +} + //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// @@ -4122,6 +4131,7 @@ {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, + {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -84,6 +84,8 @@ raw_ostream &O); void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -217,6 +217,11 @@ printNamedBit(MI, OpNo, O, "lwe"); } +void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "d16"); +} + void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { Index: lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- lib/Target/AMDGPU/MIMGInstructions.td +++ lib/Target/AMDGPU/MIMGInstructions.td @@ -36,8 +36,8 @@ (outs dst_rc:$vdata), (ins addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da, d16:$d16), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da$d16", dns>, MIMGe { let ssamp = 0; } @@ -67,8 +67,8 @@ (outs), (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da, d16:$d16), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da$d16" >, MIMGe { let ssamp = 0; let mayLoad = 1; // TableGen requires this for matching with the intrinsics @@ -117,6 +117,7 @@ MIMG_Atomic_Helper, SIMCInstr, MIMGe { + let d16 = 0; let isCodeGenOnly = 0; let AssemblerPredicates = [isSICI]; let DecoderNamespace = "SICI"; @@ -128,6 +129,7 @@ MIMG_Atomic_Helper, SIMCInstr, MIMGe { + let d16 = 0; let isCodeGenOnly = 0; let AssemblerPredicates = [isVI]; let DecoderNamespace = "VI"; @@ -162,8 +164,8 @@ (outs dst_rc:$vdata), (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, 
tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da, d16:$d16), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da$d16", dns>, MIMGe { let WQM = wqm; } @@ -199,8 +201,8 @@ (outs dst_rc:$vdata), (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da, d16:$d16), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da$d16", []>, MIMGe { let mayLoad = 1; let mayStore = 0; @@ -365,108 +367,144 @@ def : SampleRawPattern(opcode # _V4_V16), v16i32>; } -// Image + sampler for amdgcn +// ImageSample for amdgcn // TODO: -// 1. Handle half data type like v4f16, and add D16 bit support; -// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). -// 3. Add A16 support when we pass address of half type. -multiclass AMDGCNSamplePattern { +// 1. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). +// 2. Add A16 support when we pass address of half type. +multiclass ImageSamplePattern { def : GCNPat< (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), (opcode $addr, $rsrc, $sampler, (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + 0, 0, (as_i1imm $lwe), (as_i1imm $da), d16) >; } -multiclass AMDGCNSampleDataPatterns { - defm : AMDGCNSamplePattern(opcode # _V1), dt, f32>; - defm : AMDGCNSamplePattern(opcode # _V2), dt, v2f32>; - defm : AMDGCNSamplePattern(opcode # _V4), dt, v4f32>; - defm : AMDGCNSamplePattern(opcode # _V8), dt, v8f32>; - defm : AMDGCNSamplePattern(opcode # _V16), dt, v16f32>; +multiclass ImageSampleDataPatterns { + defm : ImageSamplePattern(opcode # _V1), dt, f32, d16>; + defm : ImageSamplePattern(opcode # _V2), dt, v2f32, d16>; + defm : ImageSamplePattern(opcode # _V4), dt, v4f32, d16>; + defm : ImageSamplePattern(opcode # _V8), dt, v8f32, d16>; + defm : ImageSamplePattern(opcode # _V16), dt, v16f32, d16>; } -// TODO: support v3f32. -multiclass AMDGCNSamplePatterns { - defm : AMDGCNSampleDataPatterns(opcode # _V1), f32>; - defm : AMDGCNSampleDataPatterns(opcode # _V2), v2f32>; - defm : AMDGCNSampleDataPatterns(opcode # _V4), v4f32>; +// ImageSample patterns. +multiclass ImageSamplePatterns { + defm : ImageSampleDataPatterns(opcode # _V1), f16, 1>; + defm : ImageSampleDataPatterns(opcode # _V1), f32, 0>; +let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), v2f16, 1>; +} // End HasPackedD16VMem. + defm : ImageSampleDataPatterns(opcode # _V2), v2f32, 0>; + defm : ImageSampleDataPatterns(opcode # _V4), v4f32, 0>; } -// Image only -class ImagePattern : GCNPat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, - imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), - (opcode $addr, $rsrc, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass ImagePatterns { - def : ImagePattern(opcode # _V4_V1), i32>; - def : ImagePattern(opcode # _V4_V2), v2i32>; - def : ImagePattern(opcode # _V4_V4), v4i32>; +// ImageSample altervative patterns for illegal vector half Types. 
+multiclass ImageSampleAltPatterns { +let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V1), i32, 1>; +} // End HasPackedD16VMem. + defm : ImageSampleDataPatterns(opcode # _V2), v2i32, 1>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V4), v4i32, 1>; +} // End HasUnpackedD16VMem. } -multiclass ImageLoadPattern { +// ImageLoad for amdgcn +multiclass ImageLoadPattern { def : GCNPat < (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), (opcode $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + 0, 0, (as_i1imm $lwe), (as_i1imm $da), d16) >; } -multiclass ImageLoadDataPatterns { - defm : ImageLoadPattern(opcode # _V1), dt, i32>; - defm : ImageLoadPattern(opcode # _V2), dt, v2i32>; - defm : ImageLoadPattern(opcode # _V4), dt, v4i32>; +multiclass ImageLoadDataPatterns { + defm : ImageLoadPattern(opcode # _V1), dt, i32, d16>; + defm : ImageLoadPattern(opcode # _V2), dt, v2i32, d16>; + defm : ImageLoadPattern(opcode # _V4), dt, v4i32, d16>; } +// ImageLoad patterns. // TODO: support v3f32. multiclass ImageLoadPatterns { - defm : ImageLoadDataPatterns(opcode # _V1), f32>; - defm : ImageLoadDataPatterns(opcode # _V2), v2f32>; - defm : ImageLoadDataPatterns(opcode # _V4), v4f32>; -} - -multiclass ImageStorePattern { + defm : ImageLoadDataPatterns(opcode # _V1), f16, 1>; + defm : ImageLoadDataPatterns(opcode # _V1), f32, 0>; +let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), v2f16, 1>; +} // End HasPackedD16VMem. + defm : ImageLoadDataPatterns(opcode # _V2), v2f32, 0>; + defm : ImageLoadDataPatterns(opcode # _V4), v4f32, 0>; +} + +// ImageLoad alternative patterns for illegal vector half Types. +multiclass ImageLoadAltPatterns { +let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V1), i32, 1>; +} // End HasPackedD16VMem. + defm : ImageLoadDataPatterns(opcode # _V2), v2i32, 1>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageLoadDataPatterns(opcode # _V4), v4i32, 1>; +} // End HasUnPackedD16VMem. +} + +// ImageStore for amdgcn +multiclass ImageStorePattern { def : GCNPat < (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, i1:$da), (opcode $data, $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + 0, 0, (as_i1imm $lwe), (as_i1imm $da), d16) >; } -multiclass ImageStoreDataPatterns { - defm : ImageStorePattern(opcode # _V1), dt, i32>; - defm : ImageStorePattern(opcode # _V2), dt, v2i32>; - defm : ImageStorePattern(opcode # _V4), dt, v4i32>; +multiclass ImageStoreDataPatterns { + defm : ImageStorePattern(opcode # _V1), dt, i32, d16>; + defm : ImageStorePattern(opcode # _V2), dt, v2i32, d16>; + defm : ImageStorePattern(opcode # _V4), dt, v4i32, d16>; } +// ImageStore patterns. // TODO: support v3f32. multiclass ImageStorePatterns { - defm : ImageStoreDataPatterns(opcode # _V1), f32>; - defm : ImageStoreDataPatterns(opcode # _V2), v2f32>; - defm : ImageStoreDataPatterns(opcode # _V4), v4f32>; -} - + defm : ImageStoreDataPatterns(opcode # _V1), f16, 1>; + defm : ImageStoreDataPatterns(opcode # _V1), f32, 0>; +let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), v2f16, 1>; +} // End HasPackedD16VMem. 
+ defm : ImageStoreDataPatterns(opcode # _V2), v2f32, 0>; + defm : ImageStoreDataPatterns(opcode # _V4), v4f32, 0>; +} + +// ImageStore alternative patterns. +multiclass ImageStoreAltPatterns { +let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V1), i32, 1>; +} // End HasPackedD16VMem. + defm : ImageStoreDataPatterns(opcode # _V2), v2i32, 1>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : ImageStoreDataPatterns(opcode # _V4), v4i32, 1>; +} // End HasUnpackedD16VMem. +} + +// ImageAtomic for amdgcn. class ImageAtomicPattern : GCNPat < (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; +// ImageAtomic patterns. multiclass ImageAtomicPatterns { def : ImageAtomicPattern(opcode # _V1), i32>; def : ImageAtomicPattern(opcode # _V2), v2i32>; def : ImageAtomicPattern(opcode # _V4), v4i32>; } +// ImageAtomicCmpSwap for amdgcn. class ImageAtomicCmpSwapPattern : GCNPat < (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), @@ -478,93 +516,180 @@ // ======= amdgcn Image Intrinsics ============== -// Image load +// Image load. defm : ImageLoadPatterns; defm : ImageLoadPatterns; defm : ImageLoadPatterns; - -// Image store -defm : ImageStorePatterns; -defm : ImageStorePatterns; - -// Basic sample -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with comparison -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with offsets -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Sample with comparison and offsets -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -// Gather opcodes -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; -defm : AMDGCNSamplePatterns; - -defm : AMDGCNSamplePatterns; +defm : ImageLoadAltPatterns; +defm : 
ImageLoadAltPatterns; + +// Image store. +defm : ImageStorePatterns; +defm : ImageStorePatterns; +defm : ImageStoreAltPatterns; +defm : ImageStoreAltPatterns; + +// Basic sample. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with comparison. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Sample with comparison and offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Basic gather4. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with comparison. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Gather4 with comparison and offsets. +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; +defm : ImageSamplePatterns; + +// Basic sample alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with comparison alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Sample with comparison and offsets alternative. 
+defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Basic gather4 alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with comparison alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +// Gather4 with comparison and offsets alternative. +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; +defm : ImageSampleAltPatterns; + +defm : ImageSamplePatterns; // Image atomics defm : ImageAtomicPatterns; @@ -586,34 +711,34 @@ /* SIsample for simple 1D texture lookup */ def : GCNPat < (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) + (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0, 0) >; class SamplePattern : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0, 0) >; class SampleRectPattern : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) + (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0, 0) >; class SampleArrayPattern : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1, 0) >; class SampleShadowPattern : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0, 0) >; class SampleShadowArrayPattern : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1, 0) >; /* SIsample* for texture lookups consuming more address parameters */ Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -546,6 +546,98 @@ VT == MVT::v4f16); } +static unsigned getImageOpcode (unsigned IID) { + switch (IID) { + case Intrinsic::amdgcn_image_load : return AMDGPUISD::IMAGE_LOAD; + case Intrinsic::amdgcn_image_load_mip : return AMDGPUISD::IMAGE_LOAD_MIP; + + // Basic sample. 
+ case Intrinsic::amdgcn_image_sample: return AMDGPUISD::IMAGE_SAMPLE; + case Intrinsic::amdgcn_image_sample_cl: return AMDGPUISD::IMAGE_SAMPLE_CL; + case Intrinsic::amdgcn_image_sample_d: return AMDGPUISD::IMAGE_SAMPLE_D; + case Intrinsic::amdgcn_image_sample_d_cl: return AMDGPUISD::IMAGE_SAMPLE_D_CL; + case Intrinsic::amdgcn_image_sample_l: return AMDGPUISD::IMAGE_SAMPLE_L; + case Intrinsic::amdgcn_image_sample_b: return AMDGPUISD::IMAGE_SAMPLE_B; + case Intrinsic::amdgcn_image_sample_b_cl: return AMDGPUISD::IMAGE_SAMPLE_B_CL; + case Intrinsic::amdgcn_image_sample_lz: return AMDGPUISD::IMAGE_SAMPLE_LZ; + case Intrinsic::amdgcn_image_sample_cd: return AMDGPUISD::IMAGE_SAMPLE_CD; + case Intrinsic::amdgcn_image_sample_cd_cl: return AMDGPUISD::IMAGE_SAMPLE_CD_CL; + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: return AMDGPUISD::IMAGE_SAMPLE_C; + case Intrinsic::amdgcn_image_sample_c_cl: return AMDGPUISD::IMAGE_SAMPLE_C_CL; + case Intrinsic::amdgcn_image_sample_c_d: return AMDGPUISD::IMAGE_SAMPLE_C_D; + case Intrinsic::amdgcn_image_sample_c_d_cl: return AMDGPUISD::IMAGE_SAMPLE_C_D_CL; + case Intrinsic::amdgcn_image_sample_c_l: return AMDGPUISD::IMAGE_SAMPLE_C_L; + case Intrinsic::amdgcn_image_sample_c_b: return AMDGPUISD::IMAGE_SAMPLE_C_B; + case Intrinsic::amdgcn_image_sample_c_b_cl: return AMDGPUISD::IMAGE_SAMPLE_C_B_CL; + case Intrinsic::amdgcn_image_sample_c_lz: return AMDGPUISD::IMAGE_SAMPLE_C_LZ; + case Intrinsic::amdgcn_image_sample_c_cd: return AMDGPUISD::IMAGE_SAMPLE_C_CD; + case Intrinsic::amdgcn_image_sample_c_cd_cl: return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL; + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: return AMDGPUISD::IMAGE_SAMPLE_O; + case Intrinsic::amdgcn_image_sample_cl_o: return AMDGPUISD::IMAGE_SAMPLE_CL_O; + case Intrinsic::amdgcn_image_sample_d_o: return AMDGPUISD::IMAGE_SAMPLE_D_O; + case Intrinsic::amdgcn_image_sample_d_cl_o: return AMDGPUISD::IMAGE_SAMPLE_D_CL_O; + case Intrinsic::amdgcn_image_sample_l_o: return AMDGPUISD::IMAGE_SAMPLE_L_O; + case Intrinsic::amdgcn_image_sample_b_o: return AMDGPUISD::IMAGE_SAMPLE_B_O; + case Intrinsic::amdgcn_image_sample_b_cl_o: return AMDGPUISD::IMAGE_SAMPLE_B_CL_O; + case Intrinsic::amdgcn_image_sample_lz_o: return AMDGPUISD::IMAGE_SAMPLE_LZ_O; + case Intrinsic::amdgcn_image_sample_cd_o: return AMDGPUISD::IMAGE_SAMPLE_CD_O; + case Intrinsic::amdgcn_image_sample_cd_cl_o: return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O; + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: return AMDGPUISD::IMAGE_SAMPLE_C_O; + case Intrinsic::amdgcn_image_sample_c_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_CL_O; + case Intrinsic::amdgcn_image_sample_c_d_o: return AMDGPUISD::IMAGE_SAMPLE_C_D_O; + case Intrinsic::amdgcn_image_sample_c_d_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O; + case Intrinsic::amdgcn_image_sample_c_l_o: return AMDGPUISD::IMAGE_SAMPLE_C_L_O; + case Intrinsic::amdgcn_image_sample_c_b_o: return AMDGPUISD::IMAGE_SAMPLE_C_B_O; + case Intrinsic::amdgcn_image_sample_c_b_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O; + case Intrinsic::amdgcn_image_sample_c_lz_o: return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O; + case Intrinsic::amdgcn_image_sample_c_cd_o: return AMDGPUISD::IMAGE_SAMPLE_C_CD_O; + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O; + + // Basic gather4. 
+ case Intrinsic::amdgcn_image_gather4: return AMDGPUISD::IMAGE_GATHER4; + case Intrinsic::amdgcn_image_gather4_cl: return AMDGPUISD::IMAGE_GATHER4_CL; + case Intrinsic::amdgcn_image_gather4_l: return AMDGPUISD::IMAGE_GATHER4_L; + case Intrinsic::amdgcn_image_gather4_b: return AMDGPUISD::IMAGE_GATHER4_B; + case Intrinsic::amdgcn_image_gather4_b_cl: return AMDGPUISD::IMAGE_GATHER4_B_CL; + case Intrinsic::amdgcn_image_gather4_lz: return AMDGPUISD::IMAGE_GATHER4_LZ; + + // Gather4 with comparison. + case Intrinsic::amdgcn_image_gather4_c: return AMDGPUISD::IMAGE_GATHER4_C; + case Intrinsic::amdgcn_image_gather4_c_cl: return AMDGPUISD::IMAGE_GATHER4_C_CL; + case Intrinsic::amdgcn_image_gather4_c_l: return AMDGPUISD::IMAGE_GATHER4_C_L; + case Intrinsic::amdgcn_image_gather4_c_b: return AMDGPUISD::IMAGE_GATHER4_C_B; + case Intrinsic::amdgcn_image_gather4_c_b_cl: return AMDGPUISD::IMAGE_GATHER4_C_B_CL; + case Intrinsic::amdgcn_image_gather4_c_lz: return AMDGPUISD::IMAGE_GATHER4_C_LZ; + + // Gather4 with offsets. + case Intrinsic::amdgcn_image_gather4_o: return AMDGPUISD::IMAGE_GATHER4_O; + case Intrinsic::amdgcn_image_gather4_cl_o: return AMDGPUISD::IMAGE_GATHER4_CL_O; + case Intrinsic::amdgcn_image_gather4_l_o: return AMDGPUISD::IMAGE_GATHER4_L_O; + case Intrinsic::amdgcn_image_gather4_b_o: return AMDGPUISD::IMAGE_GATHER4_B_O; + case Intrinsic::amdgcn_image_gather4_b_cl_o: return AMDGPUISD::IMAGE_GATHER4_B_CL_O; + case Intrinsic::amdgcn_image_gather4_lz_o: return AMDGPUISD::IMAGE_GATHER4_LZ_O; + + // Gather4 with comparison and offsets. + case Intrinsic::amdgcn_image_gather4_c_o: return AMDGPUISD::IMAGE_GATHER4_C_O; + case Intrinsic::amdgcn_image_gather4_c_cl_o: return AMDGPUISD::IMAGE_GATHER4_C_CL_O; + case Intrinsic::amdgcn_image_gather4_c_l_o: return AMDGPUISD::IMAGE_GATHER4_C_L_O; + case Intrinsic::amdgcn_image_gather4_c_b_o: return AMDGPUISD::IMAGE_GATHER4_C_B_O; + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O; + case Intrinsic::amdgcn_image_gather4_c_lz_o: return AMDGPUISD::IMAGE_GATHER4_C_LZ_O; + + default: break; + } + return 0; +} + + + bool SITargetLowering::isShuffleMaskLegal(ArrayRef, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -569,6 +661,88 @@ Info.writeMem = true; return true; } + // Basic sample. + case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. 
+ case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4. + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison. + case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets. + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets. + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: + + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_tbuffer_load: { @@ -583,7 +757,9 @@ return true; } case Intrinsic::amdgcn_buffer_store_format: - case Intrinsic::amdgcn_tbuffer_store: { + case Intrinsic::amdgcn_tbuffer_store: + case Intrinsic::amdgcn_image_store: + case Intrinsic::amdgcn_image_store_mip: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); Info.ptrVal = nullptr; @@ -3284,7 +3460,7 @@ M->getMemOperand()); } case Intrinsic::amdgcn_buffer_load_format: { - SDValue Ops[] = { + SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc Op.getOperand(3), // vindex @@ -3296,6 +3472,117 @@ DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vaddr + Op.getOperand(3), // rsrc + Op.getOperand(4), // dmask + Op.getOperand(5), // glc + Op.getOperand(6), // slc + Op.getOperand(7), // lwe + Op.getOperand(8) // da + }; + unsigned Opc = getImageOpcode (IID); + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + } + // Basic sample. 
+ case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4 + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison + case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vaddr + Op.getOperand(3), // rsrc + Op.getOperand(4), // sampler + Op.getOperand(5), // dmask + Op.getOperand(6), // unorm + Op.getOperand(7), // glc + Op.getOperand(8), // slc + Op.getOperand(9), // lwe + Op.getOperand(10) // da + }; + unsigned Opc = getImageOpcode (IID); + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: return SDValue(); } // End switch. 
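The cases above lower every image sample and gather4 intrinsic to the matching AMDGPUISD memory node; whether the new d16 bit is actually set is decided later at instruction selection by the result type (f16/v2f16/v4f16 results select the d16 = 1 patterns added in MIMGInstructions.td). A minimal LLVM IR usage sketch of the half-precision path this enables — the intrinsic signature is the one exercised by the tests added at the end of this patch, the function name is only illustrative:

define amdgpu_ps half @sample_f16_d16_sketch(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp) {
main_body:
  ; Returning half instead of float is what makes isel pick the d16 = 1 pattern.
  %tex = call half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %samp, i32 1, i1 false, i1 false, i1 false, i1 false, i1 false)
  ret half %tex
}

declare half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)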
@@ -4611,6 +4898,50 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + + case Intrinsic::amdgcn_image_store: + case Intrinsic::amdgcn_image_store_mip: { + // TODO: factor out the common part from byffer_store, + // tbuffer_store and image_store. + SDValue VData = Op.getOperand(2); + EVT StoreVT = VData.getValueType(); + if (isHalfVT(StoreVT)) { + // TODO: Handle v3f16. + if (StoreVT == MVT::v2f16 || StoreVT== MVT::v4f16) { + if (!Subtarget->hasUnpackedD16VMem()) { + if (!isTypeLegal(StoreVT)) { + // If Target supports packed vmem, we just need to workaround + // the illegal type by casting to an equivalent one. + EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), + StoreVT); + VData = DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + } + } else {// We need to unpack the packed data to store. + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + VData = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + } + } + } + SDValue Ops[] = { + Chain, // Chain + VData, // vdata + Op.getOperand(3), // vaddr + Op.getOperand(4), // rsrc + Op.getOperand(5), // dmask + Op.getOperand(6), // glc + Op.getOperand(7), // slc + Op.getOperand(8), // lwe + Op.getOperand(9) // da + }; + + unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ? + AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } default: return Op; } @@ -6567,7 +6898,15 @@ SelectionDAG &DAG) const { SDNode *Users[4] = { }; unsigned Lane = 0; - unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 10) ? 2 : 3; + + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + // Don't adjust if d16 bit is set. + unsigned D16Idx = DmaskIdx + 8; // Offset from dmask to d16 is always 8. + if (Node->getConstantOperandVal(D16Idx) == 1) + return; + } + unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; @@ -6788,13 +7127,18 @@ } if (TII->isMIMG(MI)) { + // Don't adjust for D16. Also MIMG may not have d16 bit. + int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::d16); + if (D16Idx > 0 && MI.getOperand(D16Idx).getImm() != 0) + return; + unsigned VReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(VReg); // TODO: Need mapping tables to handle other cases (register classes). if (RC != &AMDGPU::VReg_128RegClass) return; - unsigned DmaskIdx = MI.getNumOperands() == 12 ? 
3 : 4; + unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -242,6 +242,7 @@ bits<1> tfe; bits<1> lwe; bits<1> slc; + bits<1> d16; bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; @@ -260,6 +261,7 @@ let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; + let Inst{63} = d16; } class EXPe : Enc64 { Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -118,6 +118,135 @@ SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SDTImage_load : SDTypeProfile<1, 7, + [ + SDTCisInt<1>, // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, i32>, // dmask + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>, // slc + SDTCisVT<6, i1>, // lwe + SDTCisVT<7, i1> // da + ]>; +def SIImage_load : SDNode<"AMDGPUISD::IMAGE_LOAD", SDTImage_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SIImage_load_mip : SDNode<"AMDGPUISD::IMAGE_LOAD_MIP", SDTImage_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; + +def SDTImage_store : SDTypeProfile<0, 8, + [ + SDTCisInt<1>, // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, i32>, // dmask + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>, // slc + SDTCisVT<6, i1>, // lwe + SDTCisVT<7, i1> // da + ]>; +def SIImage_store : SDNode <"AMDGPUISD::IMAGE_STORE", + SDTImage_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIImage_store_mip : SDNode <"AMDGPUISD::IMAGE_STORE_MIP", + SDTImage_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + +class SDTImage_sample : SDNode , // vaddr + SDTCisInt<2>, // rsrc + SDTCisVT<3, v4i32>, // sampler + SDTCisVT<4, i32>, // dmask + SDTCisVT<5, i1>, // unorm + SDTCisVT<6, i1>, // glc + SDTCisVT<7, i1>, // slc + SDTCisVT<8, i1>, // lwe + SDTCisVT<9, i1> // da + ]>, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +// Basic sample. +def SIImage_sample : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE">; +def SIImage_sample_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CL">; +def SIImage_sample_d : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D">; +def SIImage_sample_d_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_CL">; +def SIImage_sample_l : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_L">; +def SIImage_sample_b : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B">; +def SIImage_sample_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_CL">; +def SIImage_sample_lz : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_LZ">; +def SIImage_sample_cd : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD">; +def SIImage_sample_cd_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_CL">; + +// Sample with comparison. 
+def SIImage_sample_c : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C">; +def SIImage_sample_c_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CL">; +def SIImage_sample_c_d : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D">; +def SIImage_sample_c_d_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_CL">; +def SIImage_sample_c_l : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_L">; +def SIImage_sample_c_b : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B">; +def SIImage_sample_c_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_CL">; +def SIImage_sample_c_lz : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_LZ">; +def SIImage_sample_c_cd : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD">; +def SIImage_sample_c_cd_cl : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_CL">; + +// Sample with offsets. +def SIImage_sample_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_O">; +def SIImage_sample_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CL_O">; +def SIImage_sample_d_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_O">; +def SIImage_sample_d_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_D_CL_O">; +def SIImage_sample_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_L_O">; +def SIImage_sample_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_O">; +def SIImage_sample_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_B_CL_O">; +def SIImage_sample_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_LZ_O">; +def SIImage_sample_cd_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_O">; +def SIImage_sample_cd_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_CD_CL_O">; + +// Sample with comparison and offsets. +def SIImage_sample_c_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_O">; +def SIImage_sample_c_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CL_O">; +def SIImage_sample_c_d_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_O">; +def SIImage_sample_c_d_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O">; +def SIImage_sample_c_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_L_O">; +def SIImage_sample_c_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_O">; +def SIImage_sample_c_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O">; +def SIImage_sample_c_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_LZ_O">; +def SIImage_sample_c_cd_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_O">; +def SIImage_sample_c_cd_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O">; + +// Basic gather4. +def SIImage_gather4 : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4">; +def SIImage_gather4_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_CL">; +def SIImage_gather4_l : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_L">; +def SIImage_gather4_b : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B">; +def SIImage_gather4_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_CL">; +def SIImage_gather4_lz : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_LZ">; + +// Gather4 with comparison. +def SIImage_gather4_c : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C">; +def SIImage_gather4_c_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_CL">; +def SIImage_gather4_c_l : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_L">; +def SIImage_gather4_c_b : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B">; +def SIImage_gather4_c_b_cl : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_CL">; +def SIImage_gather4_c_lz : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_LZ">; + +// Gather4 with offsets. 
+def SIImage_gather4_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_O">; +def SIImage_gather4_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_CL_O">; +def SIImage_gather4_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_L_O">; +def SIImage_gather4_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_O">; +def SIImage_gather4_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_B_CL_O">; +def SIImage_gather4_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_LZ_O">; + +// Gather4 with comparison and offsets. +def SIImage_gather4_c_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_O">; +def SIImage_gather4_c_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_CL_O">; +def SIImage_gather4_c_l_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_L_O">; +def SIImage_gather4_c_b_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_O">; +def SIImage_gather4_c_b_cl_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_B_CL_O">; +def SIImage_gather4_c_lz_o : SDTImage_sample<"AMDGPUISD::IMAGE_GATHER4_C_LZ_O">; + + class SDSample : SDNode , SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> @@ -618,6 +747,7 @@ def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; +def d16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; Index: test/CodeGen/AMDGPU/coalescer-subreg-join.mir =================================================================== --- test/CodeGen/AMDGPU/coalescer-subreg-join.mir +++ test/CodeGen/AMDGPU/coalescer-subreg-join.mir @@ -61,7 +61,7 @@ %11.sub6 = COPY %1 %11.sub7 = COPY %1 %11.sub8 = COPY %1 - dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec + dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit %exec %20.sub1 = COPY %2 %20.sub2 = COPY %2 %20.sub3 = COPY %2 @@ -70,6 +70,6 @@ %20.sub6 = COPY %2 %20.sub7 = COPY %2 %20.sub8 = COPY %2 - dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec + dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit %exec ... 
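The new tests below run each file three times: tonga checks the UNPACKED prefix (HasUnpackedD16VMem, one half per 32-bit VGPR, zero-extended), while gfx810 and gfx901 check the PACKED prefix (HasPackedD16VMem, two halves packed into one VGPR). A condensed sketch of the contract being tested, using the same intrinsic declaration as the test file:

define amdgpu_ps half @load_v2f16_d16_sketch(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
main_body:
  ; dmask 0x3 with a <2 x half> result sets d16; unpacked targets return the two
  ; halves in v0 and v1, packed targets return both halves packed into v0.
  %tex = call <2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
  %elt = extractelement <2 x half> %tex, i32 1
  ret half %elt
}

declare <2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)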
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED %s
+
+
+; GCN-LABEL: {{^}}image_load_f16:
+; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm d16
+define amdgpu_ps half @image_load_f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  %tex = call half @llvm.amdgcn.image.load.f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
+  ret half %tex
+}
+
+; GCN-LABEL: {{^}}image_load_v2f16:
+; UNPACKED: image_load v[0:1], v[0:3], s[0:7] dmask:0x3 unorm d16
+; UNPACKED: v_mov_b32_e32 v0, v1
+
+; PACKED: image_load v0, v[0:3], s[0:7] dmask:0x3 unorm d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v0
+define amdgpu_ps half @image_load_v2f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  %tex = call <2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
+  %elt = extractelement <2 x half> %tex, i32 1
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}image_load_v4f16:
+; UNPACKED: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm d16
+; UNPACKED: v_mov_b32_e32 v0, v3
+
+; PACKED: image_load v[0:1], v[0:3], s[0:7] dmask:0xf unorm d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+define amdgpu_ps half @image_load_v4f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.load.v4f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %elt = extractelement <4 x half> %tex, i32 3
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}image_load_mip_v4f16:
+; UNPACKED: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm d16
+; UNPACKED: v_mov_b32_e32 v0, v3
+
+; PACKED: image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xf unorm d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+define amdgpu_ps half @image_load_mip_v4f16(<4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.load.mip.v4f16.v4i32.v8i32(<4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %elt = extractelement <4 x half> %tex, i32 3
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}image_store_f16:
+; GCN: image_store v4, v[0:3], s[0:7] dmask:0x1 unorm d16
+define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_v2f16:
+; UNPACKED: image_store v[4:5], v[0:3], s[0:7] dmask:0x3 unorm d16
+; PACKED: image_store v4, v[0:3], s[0:7] dmask:0x3 unorm d16
+define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.v2f16.v4i32.v8i32(<2 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_v4f16:
+; UNPACKED: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm d16
+; PACKED: image_store v[4:5], v[0:3], s[0:7] dmask:0xf unorm d16
+define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_store_mip_v4f16:
+; UNPACKED: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm d16
+; PACKED: image_store_mip v[4:5], v[0:3], s[0:7] dmask:0xf unorm d16
+define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
+  ret void
+}
+
+
+declare half @llvm.amdgcn.image.load.f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare <2 x half> @llvm.amdgcn.image.load.v2f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.load.v4f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.load.mip.v4f16.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+
+declare void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare void @llvm.amdgcn.image.store.v2f16.v4i32.v8i32(<2 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+declare void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1)
+
+
+
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
+
+
+; GCN-LABEL: {{^}}image_gather4_f16:
+; GCN: image_gather4 v0, v[0:3], s[0:7], s[8:11] dmask:0x1 d16
+
+; UNPACKED: flat_store_short v[4:5], v0
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call half @llvm.amdgcn.image.gather4.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
+  store half %tex, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_v2f16:
+; UNPACKED: image_gather4 v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
+; UNPACKED: flat_store_short v[4:5], v1
+
+; PACKED: image_gather4 v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
+; GFX81: v_lshrrev_b32_e32 v0, 16, v0
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short_d16_hi v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_v2f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <2 x half> @llvm.amdgcn.image.gather4.v2f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <2 x half> %tex, i32 1
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_v4f16:
+; UNPACKED: image_gather4 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_gather4 v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_cl_v4f16:
+; UNPACKED: image_gather4_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_gather4_cl v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_c_v4f16:
+; UNPACKED: image_gather4_c v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_gather4_c v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_o_v4f16:
+; UNPACKED: image_gather4_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_gather4_o v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_gather4_c_o_v4f16:
+; UNPACKED: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_gather4_c_o v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.amdgcn.image.gather4.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <2 x half> @llvm.amdgcn.image.gather4.v2f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+
+
+declare <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx901 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
+
+
+; GCN-LABEL: {{^}}image_sample_f16:
+; GCN: image_sample v0, v[0:3], s[0:7], s[8:11] dmask:0x1 d16
+
+; UNPACKED: flat_store_short v[4:5], v0
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_sample_f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
+  store half %tex, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_v2f16:
+; UNPACKED: image_sample v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
+; UNPACKED: flat_store_short v[4:5], v1
+
+; PACKED: image_sample v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
+; GFX81: v_lshrrev_b32_e32 v0, 16, v0
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short_d16_hi v[4:5], v0, off
+define amdgpu_kernel void @image_sample_v2f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <2 x half> @llvm.amdgcn.image.sample.v2f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <2 x half> %tex, i32 1
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_v4f16:
+; UNPACKED: image_sample v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_sample v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_cl_v4f16:
+; UNPACKED: image_sample_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_sample_cl v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_c_v4f16:
+; UNPACKED: image_sample_c v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_sample_c v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_o_v4f16:
+; UNPACKED: image_sample_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_sample_o v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}image_sample_c_o_v4f16:
+; UNPACKED: image_sample_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; UNPACKED: flat_store_short v[4:5], v3
+
+; PACKED: image_sample_c_o v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16
+; PACKED: v_lshrrev_b32_e32 v0, 16, v1
+
+; GFX81: flat_store_short v[4:5], v0
+; GFX9: global_store_short v[4:5], v0, off
+define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt = extractelement <4 x half> %tex, i32 3
+  store half %elt, half addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.amdgcn.image.sample.f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <2 x half> @llvm.amdgcn.image.sample.v2f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+
+
+declare <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)
+declare <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)