diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -214,6 +214,18 @@ "Branch offset of 3f hardware bug" >; +def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug", + "HasImageStoreD16Bug", + "true", + "Image Store D16 hardware bug" +>; + +def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug", + "HasImageGather4D16Bug", + "true", + "Image Gather4 D16 hardware bug" +>; + class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -771,7 +783,9 @@ [FeatureVolcanicIslands, FeatureLDSBankCount16, FeatureXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageStoreD16Bug, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, @@ -779,7 +793,8 @@ FeatureLDSBankCount32, FeatureCodeObjectV3, FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC]>; + FeatureDoesNotSupportSRAMECC, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, @@ -787,7 +802,8 @@ FeatureLDSBankCount32, FeatureXNACK, FeatureDoesNotSupportSRAMECC, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, @@ -795,7 +811,8 @@ FeatureFmaMixInsts, FeatureDoesNotSupportXNACK, FeatureDoesNotSupportSRAMECC, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_6 : FeatureSet< [FeatureGFX9, @@ -806,7 +823,8 @@ FeatureDot1Insts, FeatureDot2Insts, FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_8 : FeatureSet< [FeatureGFX9, @@ -825,14 +843,16 @@ FeatureAtomicFaddInsts, FeatureSRAMECC, FeatureMFMAInlineLiteralBug, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, 
   FeatureMadMixInsts,
   FeatureLDSBankCount32,
   FeatureXNACK,
-  FeatureCodeObjectV3]>;
+  FeatureCodeObjectV3,
+  FeatureImageGather4D16Bug]>;
 
 // TODO: Organize more features into groups.
 def FeatureGroup {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -386,6 +386,8 @@
   bool HasNSAtoVMEMBug;
   bool HasOffset3fBug;
   bool HasFlatSegmentOffsetBug;
+  bool HasImageStoreD16Bug;
+  bool HasImageGather4D16Bug;
 
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable;
@@ -1001,6 +1003,14 @@
     return HasOffset3fBug;
   }
 
+  bool hasImageStoreD16Bug() const {
+    return HasImageStoreD16Bug;
+  }
+
+  bool hasImageGather4D16Bug() const {
+    return HasImageGather4D16Bug;
+  }
+
   bool hasNSAEncoding() const {
     return HasNSAEncoding;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -283,6 +283,8 @@
     HasNSAtoVMEMBug(false),
     HasOffset3fBug(false),
     HasFlatSegmentOffsetBug(false),
+    HasImageStoreD16Bug(false),
+    HasImageGather4D16Bug(false),
 
     FeatureDisable(false),
     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -103,7 +103,8 @@
                              ArrayRef<SDValue> Ops, EVT MemVT,
                              MachineMemOperand *MMO, SelectionDAG &DAG) const;
 
-  SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
+  SDValue handleD16VData(SDValue VData, SelectionDAG &DAG,
+                         bool Image = false) const;
 
   /// Converts \p Op, which must be of floating point type, to the
   /// floating point type \p VT, by either extending or truncating it.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5226,7 +5226,7 @@ SDValue Data(Result, 0); SDValue TexFail; - if (IsTexFail) { + if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); if (MaskPopVT.isVector()) { Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, @@ -5235,10 +5235,6 @@ Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, SDValue(Result, 0), ZeroIdx); } - - TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, - SDValue(Result, 0), - DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } if (DataDwordVT.isVector()) @@ -5253,8 +5249,13 @@ Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); - if (TexFail) + if (IsTexFail) { + TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); + return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + } if (Result->getNumValues() == 1) return Data; @@ -5343,7 +5344,7 @@ return Op; // D16 is unsupported for this instruction IsD16 = true; - VData = handleD16VData(VData, DAG); + VData = handleD16VData(VData, DAG, true); } NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; @@ -5363,7 +5364,11 @@ (!LoadVT.isVector() && DMaskLanes > 1)) return Op; - if (IsD16 && !Subtarget->hasUnpackedD16VMem()) + // The sq block of gfx8 and gfx9 do not estimate register use correctly + // for d16 image_gather4, image_gather4_l, and image_gather4_lz + // instructions. 
+  if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
+      !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
     NumVDataDwords = (DMaskLanes + 1) / 2;
   else
     NumVDataDwords = DMaskLanes;
@@ -6657,7 +6662,8 @@
 }
 
 SDValue SITargetLowering::handleD16VData(SDValue VData,
-                                         SelectionDAG &DAG) const {
+                                         SelectionDAG &DAG,
+                                         bool Image) const {
   EVT StoreVT = VData.getValueType();
 
   // No change for f16 and legal vector D16 types.
@@ -6678,6 +6684,38 @@
     return DAG.UnrollVectorOp(ZExt.getNode());
   }
 
+  // The sq block of gfx8.1 does not estimate register use correctly for d16
+  // image store instructions. The data operand is computed as if it were not a
+  // d16 image instruction.
+  if (Image && Subtarget->hasImageStoreD16Bug()) {
+    // Bitcast to i16
+    EVT IntStoreVT = StoreVT.changeTypeToInteger();
+    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+    // Decompose into scalars
+    SmallVector<SDValue, 4> Elts;
+    DAG.ExtractVectorElements(IntVData, Elts);
+
+    // Group pairs of i16 into v2i16 and bitcast to i32
+    SmallVector<SDValue, 4> PackedElts;
+    EVT Vec2VT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2);
+    for (unsigned i = 0 ; i < Elts.size() / 2 ; i += 1)
+    {
+      SDValue Pair = DAG.getBuildVector(Vec2VT, DL, {Elts[i * 2],
+                                                     Elts[i * 2 + 1]});
+      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+      PackedElts.push_back(IntPair);
+    }
+
+    // Pad using UNDEF
+    PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));
+
+    // Build final vector
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 PackedElts.size());
+    return DAG.getBuildVector(VecVT, DL, PackedElts);
+  }
+
   assert(isTypeLegal(StoreVT));
   return VData;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck
-check-prefixes=GCN,UNPACKED,GFX89 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s @@ -70,6 +70,7 @@ ; UNPACKED: v_and_b32_e32 ; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} ; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +; GFX81: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} ; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) { main_body: @@ -85,6 +86,7 @@ ; UNPACKED: v_and_b32_e32 ; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; GFX81: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll @@ -1,11 +1,13 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs 
| FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
 
 ; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
 ; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
 ; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
+; GFX81: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
+; GFX9: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
 ; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}
 define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
 main_body: