diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -234,6 +234,18 @@ "Branch offset of 3f hardware bug" >; +def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug", + "HasImageStoreD16Bug", + "true", + "Image Store D16 hardware bug" +>; + +def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug", + "HasImageGather4D16Bug", + "true", + "Image Gather4 D16 hardware bug" +>; + class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -810,7 +822,9 @@ [FeatureVolcanicIslands, FeatureLDSBankCount16, FeatureXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageStoreD16Bug, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, @@ -818,7 +832,8 @@ FeatureLDSBankCount32, FeatureCodeObjectV3, FeatureDoesNotSupportXNACK, - FeatureDoesNotSupportSRAMECC]>; + FeatureDoesNotSupportSRAMECC, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, @@ -826,7 +841,8 @@ FeatureLDSBankCount32, FeatureXNACK, FeatureDoesNotSupportSRAMECC, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, @@ -834,7 +850,8 @@ FeatureFmaMixInsts, FeatureDoesNotSupportXNACK, FeatureDoesNotSupportSRAMECC, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_6 : FeatureSet< [FeatureGFX9, @@ -845,7 +862,8 @@ FeatureDot1Insts, FeatureDot2Insts, FeatureDoesNotSupportXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_8 : FeatureSet< [FeatureGFX9, @@ -864,14 +882,16 @@ FeatureAtomicFaddInsts, FeatureSRAMECC, FeatureMFMAInlineLiteralBug, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureImageGather4D16Bug]>; // TODO: Organize more features into groups. def FeatureGroup { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1539,6 +1539,16 @@ DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); + // One memoperand is mandatory, except for getresinfo. + // FIXME: Check this in verifier. + if (!MI.memoperands_empty()) { + const MachineMemOperand *MMO = *MI.memoperands_begin(); + + // Infer d16 from the memory size, as the register type will be mangled by + // unpacked subtargets, or by TFE. + IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; + } + if (BaseOpcode->Store) { VDataIn = MI.getOperand(1).getReg(); VDataTy = MRI->getType(VDataIn); @@ -1548,18 +1558,8 @@ VDataTy = MRI->getType(VDataOut); NumVDataDwords = DMaskLanes; - // One memoperand is mandatory, except for getresinfo. - // FIXME: Check this in verifier. - if (!MI.memoperands_empty()) { - const MachineMemOperand *MMO = *MI.memoperands_begin(); - - // Infer d16 from the memory size, as the register type will be mangled by - // unpacked subtargets, or by TFE. 
- IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; - - if (IsD16 && !STI.hasUnpackedD16VMem()) - NumVDataDwords = (DMaskLanes + 1) / 2; - } + if (IsD16 && !STI.hasUnpackedD16VMem()) + NumVDataDwords = (DMaskLanes + 1) / 2; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -146,7 +146,7 @@ splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, - Register Reg) const; + Register Reg, bool ImageStore = false) const; bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat) const; bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3528,24 +3528,58 @@ /// Handle register layout difference for f16 images for some subtargets. Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, - Register Reg) const { - if (!ST.hasUnpackedD16VMem()) - return Reg; - + Register Reg, + bool ImageStore) const { const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); LLT StoreVT = MRI.getType(Reg); assert(StoreVT.isVector() && StoreVT.getElementType() == S16); - auto Unmerge = B.buildUnmerge(S16, Reg); + if (ST.hasUnpackedD16VMem()) { + auto Unmerge = B.buildUnmerge(S16, Reg); + + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + + int NumElts = StoreVT.getNumElements(); + + return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); + } + + if (ImageStore && ST.hasImageStoreD16Bug()) { + if (StoreVT.getNumElements() == 2) { + SmallVector<Register, 4> PackedRegs; + Reg = B.buildBitcast(S32, Reg).getReg(0); + PackedRegs.push_back(Reg); + PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); + return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); + } + + if (StoreVT.getNumElements() == 3) { + SmallVector<Register, 4> PackedRegs; + auto Unmerge = B.buildUnmerge(S16, Reg); + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + PackedRegs.push_back(Unmerge.getReg(I)); + PackedRegs.resize(8, B.buildUndef(S16).getReg(0)); + Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0); + return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0); + } - SmallVector<Register, 4> WideRegs; - for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) - WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + if (StoreVT.getNumElements() == 4) { + SmallVector<Register, 4> PackedRegs; + Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); + auto Unmerge = B.buildUnmerge(S32, Reg); + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + PackedRegs.push_back(Unmerge.getReg(I)); + PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); + return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); + } - int NumElts = StoreVT.getNumElements(); + llvm_unreachable("invalid data type"); + } - return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); + return Reg; } Register AMDGPULegalizerInfo::fixStoreSourceType( @@ -4215,7 +4249,7 @@ if (!Ty.isVector() || Ty.getElementType() != S16) return true; -
Register RepackedReg = handleD16VData(B, *MRI, VData); + Register RepackedReg = handleD16VData(B, *MRI, VData, true); if (RepackedReg != VData) { MI.getOperand(1).setReg(RepackedReg); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -411,6 +411,8 @@ bool HasNSAtoVMEMBug; bool HasOffset3fBug; bool HasFlatSegmentOffsetBug; + bool HasImageStoreD16Bug; + bool HasImageGather4D16Bug; // Dummy feature to use for assembler in tablegen. bool FeatureDisable; @@ -1025,9 +1027,11 @@ return HasOffset3fBug; } - bool hasNSAEncoding() const { - return HasNSAEncoding; - } + bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } + + bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } + + bool hasNSAEncoding() const { return HasNSAEncoding; } bool hasGFX10_BEncoding() const { return GFX10_BEncoding; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -271,6 +271,8 @@ HasNSAtoVMEMBug(false), HasOffset3fBug(false), HasFlatSegmentOffsetBug(false), + HasImageStoreD16Bug(false), + HasImageGather4D16Bug(false), FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -108,7 +108,8 @@ ArrayRef<SDValue> Ops, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) const; - SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG, + bool ImageStore = false) const; /// Converts \p Op, which must be of floating point type, to the /// floating point type \p VT, by either extending or truncating it.
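For reference, a standalone sketch of the D16 test that the instruction-selector hunk above now applies before the load/store split (illustrative only, not part of the patch; the helper name is made up): an access is D16 exactly when the memory operand provides fewer than 32 bits per enabled dmask lane, so it can be inferred from the memoperand even when TFE or an unpacked subtarget has mangled the register type.

// Illustrative only; mirrors IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32.
#include <cassert>
#include <cstdint>

static bool isD16FromMemSize(uint64_t MemSizeBytes, unsigned DMaskLanes) {
  // Fewer than 32 bits per enabled dmask lane means packed 16-bit data.
  return (8 * MemSizeBytes) / DMaskLanes < 32;
}

int main() {
  assert(isD16FromMemSize(4, 2));  // v2f16: 4 bytes over 2 lanes -> d16
  assert(!isD16FromMemSize(8, 2)); // v2f32: 8 bytes over 2 lanes -> not d16
  assert(isD16FromMemSize(6, 3));  // v3f16: 6 bytes over 3 lanes -> d16
  return 0;
}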
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5851,7 +5851,7 @@ SDValue Data(Result, 0); SDValue TexFail; - if (IsTexFail) { + if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); if (MaskPopVT.isVector()) { Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, @@ -5860,10 +5860,6 @@ Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, SDValue(Result, 0), ZeroIdx); } - - TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, - SDValue(Result, 0), - DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } if (DataDwordVT.isVector()) @@ -5887,8 +5883,13 @@ } Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data); - if (TexFail) + if (IsTexFail) { + TexFail = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); + return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + } if (Result->getNumValues() == 1) return Data; @@ -6007,7 +6008,7 @@ return Op; // D16 is unsupported for this instruction IsD16 = true; - VData = handleD16VData(VData, DAG); + VData = handleD16VData(VData, DAG, true); } NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; @@ -6027,7 +6028,11 @@ (!LoadVT.isVector() && DMaskLanes > 1)) return Op; - if (IsD16 && !Subtarget->hasUnpackedD16VMem()) + // The sq block of gfx8 and gfx9 do not estimate register use correctly + // for d16 image_gather4, image_gather4_l, and image_gather4_lz + // instructions. + if (IsD16 && !Subtarget->hasUnpackedD16VMem() && + !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) NumVDataDwords = (DMaskLanes + 1) / 2; else NumVDataDwords = DMaskLanes; @@ -7401,8 +7406,8 @@ return NewOp; } -SDValue SITargetLowering::handleD16VData(SDValue VData, - SelectionDAG &DAG) const { +SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, + bool ImageStore) const { EVT StoreVT = VData.getValueType(); // No change for f16 and legal vector D16 types. @@ -7434,6 +7439,36 @@ return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); } + // The sq block of gfx8.1 does not estimate register use correctly for d16 + // image store instructions. The data operand is computed as if it were not a + // d16 image instruction. 
+ if (ImageStore && Subtarget->hasImageStoreD16Bug()) { + // Bitcast to i16 + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + // Decompose into scalars + SmallVector<SDValue, 4> Elts; + DAG.ExtractVectorElements(IntVData, Elts); + + // Group pairs of i16 into v2i16 and bitcast to i32 + SmallVector<SDValue, 4> PackedElts; + for (unsigned I = 0; I < Elts.size() / 2; I += 1) { + SDValue Pair = + DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]}); + SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); + PackedElts.push_back(IntPair); + } + + // Pad using UNDEF + PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32)); + + // Build final vector + EVT VecVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size()); + return DAG.getBuildVector(VecVT, DL, PackedElts); + } + assert(isTypeLegal(StoreVT)); return VData; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -1,8 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=PACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX81 %s define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) { + ; PACKED-LABEL: name: image_store_f16 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; PACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") + ; PACKED: S_ENDPGM 0 ; UNPACKED-LABEL: name: image_store_f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 @@ -22,7 +41,31 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32),
[[COPY9]](s32) ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") ; UNPACKED: S_ENDPGM 0 - ; PACKED-LABEL: name: image_store_f16 + ; GFX81-LABEL: name: image_store_f16 + ; GFX81: bb.1 (%ir-block.0): + ; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX81: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX81: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") + ; GFX81: S_ENDPGM 0 + call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) { + ; PACKED-LABEL: name: image_store_v2f16 ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 @@ -35,17 +78,11 @@ ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; PACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") + ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") ; PACKED: S_ENDPGM 0 - call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) - ret void -} - -define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) { ; UNPACKED-LABEL: name: image_store_v2f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, 
$sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 @@ -70,9 +107,35 @@ ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY11]](s32), [[COPY12]](s32) ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") ; UNPACKED: S_ENDPGM 0 - ; PACKED-LABEL: name: image_store_v2f16 + ; GFX81-LABEL: name: image_store_v2f16 + ; GFX81: bb.1 (%ir-block.0): + ; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX81: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) + ; GFX81: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[DEF]](s32) + ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") + ; GFX81: S_ENDPGM 0 + call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) { + ; PACKED-LABEL: name: image_store_v3f16 ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 @@ -84,15 +147,34 @@ ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) + ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 16 + ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") + ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) ; PACKED: S_ENDPGM 0 - call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) - ret void -} - -define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) { ; UNPACKED-LABEL: name: image_store_v3f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -123,7 +205,58 @@ ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) ; UNPACKED: S_ENDPGM 0 - ; PACKED-LABEL: name: image_store_v3f16 + ; GFX81-LABEL: name: image_store_v3f16 + ; GFX81: bb.1 (%ir-block.0): + ; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX81: 
[[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX81: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX81: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) + ; GFX81: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX81: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX81: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX81: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX81: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX81: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX81: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX81: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; GFX81: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX81: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX81: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX81: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX81: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] + ; GFX81: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX81: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; GFX81: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX81: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) + ; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>) + ; GFX81: [[BITCAST5:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<8 x s16>) + ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<4 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; GFX81: S_ENDPGM 0 + call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) { + ; PACKED-LABEL: name: image_store_v4f16 ; PACKED: bb.1 (%ir-block.0): ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 @@ -139,37 +272,10 @@ ; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) - ; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) - ; 
PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) - ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) - ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] - ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>) - ; PACKED: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") ; PACKED: S_ENDPGM 0 - call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) - ret void -} - -define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) { ; UNPACKED-LABEL: name: image_store_v4f16 ; UNPACKED: bb.1 (%ir-block.0): ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -199,26 +305,30 @@ ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") ; UNPACKED: S_ENDPGM 0 - ; PACKED-LABEL: name: image_store_v4f16 - ; PACKED: bb.1 (%ir-block.0): - ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 - ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 - ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 - ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 - ; PACKED: 
[[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 - ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) - ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") - ; PACKED: S_ENDPGM 0 + ; GFX81-LABEL: name: image_store_v4f16 + ; GFX81: bb.1 (%ir-block.0): + ; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX81: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>) + ; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; GFX81: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32) + ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") + ; GFX81: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=PACKED %s +; RUN: llc -global-isel 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) { ; UNPACKED-LABEL: image_store_f16: @@ -13,7 +13,7 @@ ; UNPACKED-NEXT: s_mov_b32 s5, s7 ; UNPACKED-NEXT: s_mov_b32 s6, s8 ; UNPACKED-NEXT: s_mov_b32 s7, s9 -; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm +; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; UNPACKED-NEXT: s_endpgm ; ; PACKED-LABEL: image_store_f16: @@ -26,8 +26,21 @@ ; PACKED-NEXT: s_mov_b32 s5, s7 ; PACKED-NEXT: s_mov_b32 s6, s8 ; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; PACKED-NEXT: s_endpgm +; +; GFX81-LABEL: image_store_f16: +; GFX81: ; %bb.0: +; GFX81-NEXT: s_mov_b32 s0, s2 +; GFX81-NEXT: s_mov_b32 s1, s3 +; GFX81-NEXT: s_mov_b32 s2, s4 +; GFX81-NEXT: s_mov_b32 s3, s5 +; GFX81-NEXT: s_mov_b32 s4, s6 +; GFX81-NEXT: s_mov_b32 s5, s7 +; GFX81-NEXT: s_mov_b32 s6, s8 +; GFX81-NEXT: s_mov_b32 s7, s9 +; GFX81-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 +; GFX81-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -44,7 +57,7 @@ ; UNPACKED-NEXT: s_mov_b32 s6, s8 ; UNPACKED-NEXT: s_mov_b32 s7, s9 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm +; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 ; UNPACKED-NEXT: s_endpgm ; ; PACKED-LABEL: image_store_v2f16: @@ -57,8 +70,21 @@ ; PACKED-NEXT: s_mov_b32 s5, s7 ; PACKED-NEXT: s_mov_b32 s6, s8 ; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm +; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16 ; PACKED-NEXT: s_endpgm +; +; GFX81-LABEL: image_store_v2f16: +; GFX81: ; %bb.0: +; GFX81-NEXT: s_mov_b32 s0, s2 +; GFX81-NEXT: s_mov_b32 s1, s3 +; GFX81-NEXT: s_mov_b32 s2, s4 +; GFX81-NEXT: s_mov_b32 s3, s5 +; GFX81-NEXT: s_mov_b32 s4, s6 +; GFX81-NEXT: s_mov_b32 s5, s7 +; GFX81-NEXT: s_mov_b32 s6, s8 +; GFX81-NEXT: s_mov_b32 s7, s9 +; GFX81-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 +; GFX81-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -85,7 +111,7 @@ ; UNPACKED-NEXT: v_mov_b32_e32 v5, v0 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm +; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16 ; UNPACKED-NEXT: s_endpgm ; ; PACKED-LABEL: image_store_v4f16: @@ -98,8 +124,21 @@ ; PACKED-NEXT: s_mov_b32 s5, s7 ; PACKED-NEXT: s_mov_b32 s6, s8 ; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm +; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16 ; PACKED-NEXT: s_endpgm +; +; GFX81-LABEL: image_store_v4f16: +; GFX81: ; %bb.0: +; GFX81-NEXT: s_mov_b32 s0, s2 +; GFX81-NEXT: s_mov_b32 s1, s3 +; GFX81-NEXT: s_mov_b32 s2, s4 +; GFX81-NEXT: s_mov_b32 s3, s5 +; GFX81-NEXT: s_mov_b32 s4, s6 +; GFX81-NEXT: s_mov_b32 s5, s7 +; GFX81-NEXT: s_mov_b32 s6, s8 +; GFX81-NEXT: s_mov_b32 s7, s9 +; GFX81-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16 +; GFX81-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x 
half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s @@ -15,6 +15,7 @@ ; GCN-LABEL: {{^}}image_load_v2f16: ; UNPACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} ; PACKED: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +; GFX81: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} ; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps float @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { main_body: @@ -38,6 +39,7 @@ ; GCN-LABEL: {{^}}image_load_v4f16: ; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; GFX81: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps <2 x float> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { main_body: @@ -49,6 +51,7 @@ ; GCN-LABEL: {{^}}image_load_mip_v4f16: ; UNPACKED: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm d16{{$}} ; PACKED: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}} +; GFX81: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}} ; GFX10: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps <2 x float> @image_load_mip_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { main_body: @@ -60,6 +63,7 @@ ; GCN-LABEL: {{^}}image_load_3d_v2f16: ; UNPACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x3 unorm d16{{$}} ; PACKED: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}} +; GFX81: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}} ; GFX10: image_load v0, v[0:2], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm d16{{$}} define amdgpu_ps float @image_load_3d_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { main_body: @@ -90,6 +94,7 @@ ; UNPACKED: v_and_b32_e32 ; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} ; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} +; GFX81: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}} ; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) { main_body: @@ -113,6 +118,7 @@ ; UNPACKED: v_and_b32_e32 ; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; GFX81: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps void 
@image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) { main_body: @@ -128,6 +134,7 @@ ; UNPACKED: v_and_b32_e32 ; UNPACKED: image_store_mip v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; PACKED: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} +; GFX81: image_store_mip v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}} ; GFX10: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16{{$}} define amdgpu_ps void @image_store_mip_1d_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %mip, <2 x float> %in) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll @@ -1,11 +1,13 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16: ; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}} ; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}} +; GFX81: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}} +; GFX9: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}} ; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}} define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { main_body:
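For context on the gfx8.1 image-store workaround in the patch, here is a rough standalone sketch (not part of the patch; the function names are made up, and the behaviour is taken from the legalizer cases and the GFX81 test checks): the packed D16 payload is padded with undef registers to twice its packed dword count, so the store occupies the register range the buggy SQ register-use estimate expects. This is why the GFX81 checks use v[2:3] for a v2f16 store and v[2:5] for a v4f16 store, while PACKED subtargets keep v2 and v[2:3].

// Illustrative only: data-operand dword counts for d16 image stores.
#include <cassert>

static unsigned packedDwords(unsigned NumF16Elts) {
  return (NumF16Elts + 1) / 2;            // two halves per 32-bit register
}

static unsigned gfx81StoreDwords(unsigned NumF16Elts) {
  // Scalar f16 data is left alone; vectors are padded with undef up to
  // double the packed size to satisfy the SQ block's register estimate.
  return NumF16Elts <= 1 ? 1 : 2 * packedDwords(NumF16Elts);
}

int main() {
  assert(gfx81StoreDwords(2) == 2);  // v2f16: image_store v[2:3] ... d16
  assert(gfx81StoreDwords(3) == 4);  // v3f16: <4 x s32> operand in the MIR
  assert(gfx81StoreDwords(4) == 4);  // v4f16: image_store v[2:5] ... d16
  return 0;
}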
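Similarly, a small standalone sketch of the destination-register decision behind the gather4 changes (not part of the patch; the helper is hypothetical): gather4 always returns four enabled lanes, and on subtargets with the gather4 D16 bug the packed (halved) dword count is deliberately skipped, which is why the gfx810 and gfx900 checks now expect image_gather4_b v[0:3] instead of v[0:1].

// Illustrative only: NumVDataDwords for a d16 gather4 result.
#include <cassert>

static unsigned gather4D16Dwords(bool HasUnpackedD16VMem,
                                 bool HasImageGather4D16Bug) {
  const unsigned DMaskLanes = 4;     // gather4 always pops four lanes
  if (!HasUnpackedD16VMem && !HasImageGather4D16Bug)
    return (DMaskLanes + 1) / 2;     // packed d16: v[0:1]
  return DMaskLanes;                 // unpacked, or bug workaround: v[0:3]
}

int main() {
  assert(gather4D16Dwords(true, false) == 4);   // tonga (unpacked)
  assert(gather4D16Dwords(false, true) == 4);   // gfx810, gfx900 (workaround)
  assert(gather4D16Dwords(false, false) == 2);  // gfx10
  return 0;
}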