Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2788,69 +2788,6 @@
   return true;
 }
 
-// Produce a vector of s16 elements from s32 pieces.
-static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
-                             ArrayRef UnmergeParts) {
-  const LLT S16 = LLT::scalar(16);
-
-  SmallVector RemergeParts(UnmergeParts.size());
-  for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
-    RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
-
-  B.buildBuildVector(DstReg, RemergeParts);
-}
-
-/// Convert a set of s32 registers to a result vector with s16 elements.
-static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
-                               ArrayRef UnmergeParts) {
-  MachineRegisterInfo &MRI = *B.getMRI();
-  const LLT V2S16 = LLT::vector(2, 16);
-  LLT TargetTy = MRI.getType(DstReg);
-  int NumElts = UnmergeParts.size();
-
-  if (NumElts == 1) {
-    assert(TargetTy == V2S16);
-    B.buildBitcast(DstReg, UnmergeParts[0]);
-    return;
-  }
-
-  SmallVector RemergeParts(NumElts);
-  for (int I = 0; I != NumElts; ++I)
-    RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
-
-  if (TargetTy.getSizeInBits() == 32u * NumElts) {
-    B.buildConcatVectors(DstReg, RemergeParts);
-    return;
-  }
-
-  const LLT V3S16 = LLT::vector(3, 16);
-  const LLT V6S16 = LLT::vector(6, 16);
-
-  // Widen to v6s16 and unpack v3 parts.
-  assert(TargetTy == V3S16);
-
-  RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
-  auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
-  B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
-}
-
-// FIXME: Just vector trunc should be sufficent, but legalization currently
-// broken.
-static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
-                                  Register WideDstReg) {
-  const LLT S32 = LLT::scalar(32);
-  const LLT S16 = LLT::scalar(16);
-
-  auto Unmerge = B.buildUnmerge(S32, WideDstReg);
-
-  int NumOps = Unmerge->getNumOperands() - 1;
-  SmallVector RemergeParts(NumOps);
-  for (int I = 0; I != NumOps; ++I)
-    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
-
-  B.buildBuildVector(DstReg, RemergeParts);
-}
-
 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
 /// vector with s16 typed elements.
 static void packImageA16AddressToDwords(MachineIRBuilder &B,
@@ -2909,14 +2846,18 @@
   return BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
 }
 
+static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+                       int NumDefs) {
+  assert(!BaseOpcode->Atomic);
+  return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
+}
+
 /// Return first address operand index in an image intrinsic.
 static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
                                  int NumDefs) {
   if (BaseOpcode->Atomic)
     return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
-
-  int DMaskIdx = NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
-  return DMaskIdx + 1;
+  return getDMaskIdx(BaseOpcode, NumDefs) + 1;
 }
 
 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
@@ -2949,6 +2890,7 @@
   MachineRegisterInfo *MRI = B.getMRI();
   const LLT S32 = LLT::scalar(32);
   const LLT S16 = LLT::scalar(16);
+  const LLT V2S16 = LLT::vector(2, 16);
 
   // Index of first address argument
   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
@@ -3008,7 +2950,12 @@
   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
     return true;
+  const int DMaskIdx = getDMaskIdx(BaseOpcode, NumDefs);
+  unsigned DMask = MI.getOperand(DMaskIdx).getImm();
+  const int DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+
   if (BaseOpcode->Store) { // No TFE for stores?
+    // TODO: Handle dmask trim
     Register VData = MI.getOperand(1).getReg();
     LLT Ty = MRI->getType(VData);
     if (!Ty.isVector() || Ty.getElementType() != S16)
       return true;
@@ -3023,93 +2970,164 @@
   LLT Ty = MRI->getType(DstReg);
   const LLT EltTy = Ty.getScalarType();
   const bool IsD16 = Ty.getScalarType() == S16;
-  const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
 
-  if (IsTFE) {
-    // In the IR, TFE is supposed to be used with a 2 element struct return
-    // type. The intruction really returns these two values in one contiguous
-    // register, with one additional dword beyond the loaded data. Rewrite the
-    // return type to use a single register result.
-    Register Dst1Reg = MI.getOperand(1).getReg();
-    if (MRI->getType(Dst1Reg) != S32)
-      return false;
+  // Confirm that the return type is large enough for the dmask specified
+  if (NumElts < DMaskLanes)
+    return false;
 
-    // TODO: Make sure the TFE operand bit is set.
+  if (NumElts > 4 || DMaskLanes > 4)
+    return false;
 
-    // The raw dword aligned data component of the load. The only legal cases
-    // where this matters should be when using the packed D16 format, for
-    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
-    LLT RoundedTy;
-    LLT TFETy;
+  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
+  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
 
-    if (IsD16 && ST.hasUnpackedD16VMem()) {
-      RoundedTy = LLT::scalarOrVector(NumElts, 32);
-      TFETy = LLT::vector(NumElts + 1, 32);
-    } else {
-      unsigned EltSize = Ty.getScalarSizeInBits();
-      unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
-      unsigned RoundedSize = 32 * RoundedElts;
-      RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
-      TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
-    }
+  // The raw dword aligned data component of the load. The only legal cases
+  // where this matters should be when using the packed D16 format, for
+  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
+  LLT RoundedTy;
 
-    Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
+  // S32 vector to cover all data, plus TFE result element.
+  LLT TFETy;
 
-    // FIXME: Do we need to notify the observer of the instruction change?
-    MI.getOperand(0).setReg(TFEReg);
-    MI.RemoveOperand(1);
+  // Register type to use for each loaded component. Will be S32 or V2S16.
+  LLT RegTy;
+
+  if (IsD16 && ST.hasUnpackedD16VMem()) {
+    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
+    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
+    RegTy = S32;
+  } else {
+    unsigned EltSize = EltTy.getSizeInBits();
+    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
+    unsigned RoundedSize = 32 * RoundedElts;
+    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
+    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
+    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
+  }
+
+  // The return type does not need adjustment.
+  // TODO: Should we change s16 case to s32 or <2 x s16>?
+  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
+    return true;
+
+  Register Dst1Reg;
+
+  // Insert after the instruction.
+  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+
+  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
+  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
+  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
+  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
 
-    // Insert after the instruction.
-    B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
 
-    // Now figure out how to copy the new result register back into the old
-    // result.
+  // FIXME: Do we need to notify the observer of the instruction change?
+  MI.getOperand(0).setReg(NewResultReg);
+
+  // In the IR, TFE is supposed to be used with a 2 element struct return
+  // type. The instruction really returns these two values in one contiguous
+  // register, with one additional dword beyond the loaded data. Rewrite the
+  // return type to use a single register result.
 
-    SmallVector UnmergeResults(TFETy.getNumElements(), Dst1Reg);
-    int NumDataElts = TFETy.getNumElements() - 1;
+  if (IsTFE) {
+    Dst1Reg = MI.getOperand(1).getReg();
+    if (MRI->getType(Dst1Reg) != S32)
+      return false;
 
-    if (!Ty.isVector()) {
-      // Simplest case is a trivial unmerge (plus a truncate for d16).
-      UnmergeResults[0] = Ty == S32 ?
-        DstReg : MRI->createGenericVirtualRegister(S32);
+    // TODO: Make sure the TFE operand bit is set.
+    // FIXME: Do we need to notify the observer of the instruction change?
+    MI.RemoveOperand(1);
 
-      B.buildUnmerge(UnmergeResults, TFEReg);
-      if (Ty != S32)
-        B.buildTrunc(DstReg, UnmergeResults[0]);
+    // Handle the easy case that requires no repack instructions.
+    if (Ty == S32) {
+      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
       return true;
     }
+  }
 
-    // We have to repack into a new vector of some kind.
-    for (int I = 0; I != NumDataElts; ++I)
-      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
-    B.buildUnmerge(UnmergeResults, TFEReg);
+  // Now figure out how to copy the new result register back into the old
+  // result.
+  SmallVector ResultRegs(ResultNumRegs, Dst1Reg);
 
-    // Drop the final TFE element.
-    ArrayRef DataPart(UnmergeResults.data(), NumDataElts);
+  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
 
-    if (EltTy == S32)
-      B.buildBuildVector(DstReg, DataPart);
-    else if (ST.hasUnpackedD16VMem())
-      truncToS16Vector(B, DstReg, DataPart);
-    else
-      bitcastToS16Vector(B, DstReg, DataPart);
+  if (ResultNumRegs == 1) {
+    assert(!IsTFE);
+    ResultRegs[0] = NewResultReg;
+  } else {
+    // We have to repack into a new vector of some kind.
+    for (int I = 0; I != NumDataRegs; ++I)
+      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
+    B.buildUnmerge(ResultRegs, NewResultReg);
+
+    // Drop the final TFE element to get the data part. The TFE result is
+    // directly written to the right place already.
+    if (IsTFE)
+      ResultRegs.resize(NumDataRegs);
+  }
 
+  // For an s16 scalar result, we form an s32 result with a truncate regardless
+  // of packed vs. unpacked.
+  if (IsD16 && !Ty.isVector()) {
+    B.buildTrunc(DstReg, ResultRegs[0]);
     return true;
   }
 
-  // Must be an image load.
-  if (!ST.hasUnpackedD16VMem() || !Ty.isVector() || Ty.getElementType() != S16)
+  // Avoid a build/concat_vector of 1 entry.
+  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
+    B.buildBitcast(DstReg, ResultRegs[0]);
     return true;
+  }
 
-  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+  assert(Ty.isVector());
 
-  LLT WidenedTy = Ty.changeElementType(S32);
-  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
+  if (IsD16) {
+    // For packed D16 results with TFE enabled, all the data components are
+    // S32. Cast back to the expected type.
+    //
+    // TODO: We don't really need to use s32 load elements. We would only need one
+    // cast for the TFE result if a multiple of v2s16 was used.
+    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
+      for (Register &Reg : ResultRegs)
+        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
+    } else if (ST.hasUnpackedD16VMem()) {
+      for (Register &Reg : ResultRegs)
+        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
+  }
 
-  // FIXME: Do we need to notify the observer of the instruction change?
-  MI.getOperand(0).setReg(WideDstReg);
+  auto padWithUndef = [&](LLT Ty, int NumElts) {
+    if (NumElts == 0)
+      return;
+    Register Undef = B.buildUndef(Ty).getReg(0);
+    for (int I = 0; I != NumElts; ++I)
+      ResultRegs.push_back(Undef);
+  };
+
+  // Pad out any elements eliminated due to the dmask.
+  LLT ResTy = MRI->getType(ResultRegs[0]);
+  if (!ResTy.isVector()) {
+    padWithUndef(ResTy, NumElts - ResultRegs.size());
+    B.buildBuildVector(DstReg, ResultRegs);
+    return true;
+  }
+
+  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
+  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
+
+  // Deal with the one annoying legal case.
+  const LLT V3S16 = LLT::vector(3, 16);
+  if (Ty == V3S16) {
+    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
+    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
+    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
+    return true;
+  }
 
-  repackUnpackedD16Load(B, DstReg, WideDstReg);
+  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
+  B.buildConcatVectors(DstReg, ResultRegs);
   return true;
 }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -2918,12 +2918,12 @@
   ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-  ; GFX9: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
-  ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<4 x s32>)
-  ; GFX9: $vgpr0 = COPY [[UV]](s32)
-  ; GFX9: $vgpr1 = COPY [[UV1]](s32)
-  ; GFX9: $vgpr2 = COPY [[UV2]](s32)
-  ; GFX9: $vgpr3 = COPY [[UV3]](s32)
+  ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
+  ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX9: $vgpr0 = COPY [[INT]](s32)
+  ; GFX9: $vgpr1 = COPY [[DEF]](s32)
+  ; GFX9: $vgpr2 = COPY [[DEF]](s32)
+  ; GFX9: $vgpr3 = COPY [[DEF]](s32)
   ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
   ;
GFX10NSA-LABEL: name: getresinfo_dmask0 ; GFX10NSA: bb.1.main_body: @@ -2940,12 +2940,12 @@ ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10NSA: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 - ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<4 x s32>) - ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) - ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) - ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) - ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) + ; GFX10NSA: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 + ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10NSA: $vgpr0 = COPY [[INT]](s32) + ; GFX10NSA: $vgpr1 = COPY [[DEF]](s32) + ; GFX10NSA: $vgpr2 = COPY [[DEF]](s32) + ; GFX10NSA: $vgpr3 = COPY [[DEF]](s32) ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll @@ -138,9 +138,13 @@ ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[INT:%[0-9]+]]:_(<3 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) - ; PACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[INT]](<3 x s16>), 0 + ; PACKED: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INT]](<4 x s16>) + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV2]](<3 x s16>), 0 ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) @@ -440,6 +444,1128 @@ ret <4 x half> %tex } +define 
amdgpu_ps half @image_load_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; UNPACKED: $vgpr0 = COPY [[ANYEXT]](s32) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; PACKED-LABEL: name: image_load_f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; PACKED: $vgpr0 = COPY [[ANYEXT]](s32) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + %tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret half %tex +} + +define amdgpu_ps <2 x half> @image_load_v2f16_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v2f16_dmask_1000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: 
[[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[INT]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; PACKED-LABEL: name: image_load_v2f16_dmask_1000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: $vgpr0 = COPY [[INT]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + %tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %tex +} + +define amdgpu_ps <2 x half> @image_load_v2f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v2f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), 
[[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[INT]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; PACKED-LABEL: name: image_load_v2f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: $vgpr0 = COPY [[INT]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + %tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x half> %tex +} + +define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v3f16_dmask_1100 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF1]](s16) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 + ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v3f16_dmask_1100 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[INT]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x half> %tex +} + +define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v3f16_dmask_1000 + ; UNPACKED: bb.1 (%ir-block.0): + ; 
UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[INT]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF1]](s16) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 + ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v3f16_dmask_1000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[INT]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x half> %tex +} + +define amdgpu_ps <3 x half> @image_load_v3f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v3f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[INT]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF1]](s16) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 + ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v3f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[INT]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v4f16_dmask_1110 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<3 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; UNPACKED: [[TRUNC2:%[0-9]+]]:_(s16) 
= G_TRUNC [[UV2]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v4f16_dmask_1110 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INT]](<4 x s16>) + ; PACKED: $vgpr0 = COPY [[UV]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v4f16_dmask_1100 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + 
; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v4f16_dmask_1100 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: $vgpr0 = COPY [[INT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[DEF]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v4f16_dmask_1000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[INT]](s32) + ; 
UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v4f16_dmask_1000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: $vgpr0 = COPY [[INT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[DEF]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_v4f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_v4f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[INT]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR 
[[TRUNC]](s16), [[DEF]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_v4f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: $vgpr0 = COPY [[INT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[DEF]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %tex +} + +define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY 
[[UV]](s32) + ; UNPACKED: $vgpr0 = COPY [[COPY10]](s32) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; PACKED-LABEL: name: image_load_tfe_f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; PACKED: $vgpr0 = COPY [[COPY10]](s32) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + %res = call { half, i32 } @llvm.amdgcn.image.load.2d.sl_f16i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { half, i32 } %res, 0 + %tfe = extractvalue { half, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret half %tex +} + +define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v2f16_dmask_1000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF1]](s16) + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; PACKED-LABEL: name: image_load_tfe_v2f16_dmask_1000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + %res = call { <2 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f16i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <2 x half>, i32 } %res, 0 + %tfe = extractvalue { <2 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <2 x half> %tex +} + +define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v2f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: 
[[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF1]](s16) + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; PACKED-LABEL: name: image_load_tfe_v2f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 + %res = call { <2 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f16i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <2 x half>, i32 } %res, 0 + %tfe = extractvalue { <2 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <2 x half> %tex +} + +define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v3f16_dmask_1100 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: 
[[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<3 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF1]](s16), [[DEF2]](s16) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 + ; UNPACKED: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v3f16_dmask_1100 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x 
s16>), [[DEF1]](<2 x s16>), [[DEF1]](<2 x s16>) + ; PACKED: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV2]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <3 x half>, i32 } %res, 0 + %tfe = extractvalue { <3 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <3 x half> %tex +} + +define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v3f16_dmask_1000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF1]](s16), [[DEF2]](s16) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = 
G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v3f16_dmask_1000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[DEF1]](<2 x s16>), [[DEF1]](<2 x s16>) + ; PACKED: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV2]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <3 x half>, i32 } %res, 0 + %tfe = extractvalue { <3 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <3 x half> %tex +} + +define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v3f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = 
COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF1]](s16), [[DEF2]](s16) + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v3f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x 
s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[DEF1]](<2 x s16>), [[DEF1]](<2 x s16>) + ; PACKED: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV2]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <3 x half>, i32 } %res, 0 + %tfe = extractvalue { <3 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <3 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v4f16_dmask_1110 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<4 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; UNPACKED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF1]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v4f16_dmask_1110 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, 
$sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<3 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV1]](s32) + ; PACKED: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <4 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f16i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x half>, i32 } %res, 0 + %tfe = extractvalue { <4 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v4f16_dmask_1100 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<3 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC 
[[UV]](s32) + ; UNPACKED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF1]](s16), [[DEF1]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v4f16_dmask_1100 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[DEF1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <4 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f16i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x half>, i32 } %res, 0 + %tfe = extractvalue { <4 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v4f16_dmask_1000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 
x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF1]](s16), [[DEF1]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v4f16_dmask_1000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[DEF1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <4 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f16i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x half>, i32 } %res, 0 + %tfe = extractvalue { <4 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x half> %tex +} + +define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + ; UNPACKED-LABEL: name: image_load_tfe_v4f16_dmask_0000 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED: liveins: $sgpr2, $sgpr3, 
$sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; UNPACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF1]](s16) + ; UNPACKED: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[DEF1]](s16), [[DEF1]](s16) + ; UNPACKED: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) + ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; PACKED-LABEL: name: image_load_tfe_v4f16_dmask_0000 + ; PACKED: bb.1 (%ir-block.0): + ; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1 + ; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; PACKED: [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) + ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; PACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; PACKED: 
$vgpr1 = COPY [[DEF1]](<2 x s16>) + ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %res = call { <4 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f16i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x half>, i32 } %res, 0 + %tfe = extractvalue { <4 x half>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x half> %tex +} + declare half @llvm.amdgcn.image.load.2d.f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 declare <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 declare <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll @@ -224,6 +224,136 @@ ret <4 x float> %tex } +define amdgpu_ps float @image_load_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret float %tex +} + +define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %tex +} + +define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %tex +} + +define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x float> %tex +} + +define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x float> %tex +} + +define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %tex +} + +define amdgpu_ps float 
@image_load_tfe_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { float, i32 } @llvm.amdgcn.image.load.2d.sl_f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { float, i32 } %res, 0 + %tfe = extractvalue { float, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret float %tex +} + +define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <2 x float>, i32 } %res, 0 + %tfe = extractvalue { <2 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <2 x float> %tex +} + +define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <2 x float>, i32 } %res, 0 + %tfe = extractvalue { <2 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <2 x float> %tex +} + +define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <3 x float>, i32 } %res, 0 + %tfe = extractvalue { <3 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <3 x float> %tex +} + +define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <3 x float>, i32 } %res, 0 + %tfe = extractvalue { <3 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <3 x float> %tex +} + +define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <3 x float>, i32 } %res, 0 + %tfe = extractvalue { <3 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <3 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x float>, i32 } %res, 0 + %tfe = extractvalue { <4 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x float>, i32 } %res, 0 + %tfe = extractvalue { <4 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x float>, i32 } %res, 0 + %tfe = extractvalue { <4 x float>, i32 } %res, 1 + 
store i32 %tfe, i32 addrspace(1)* undef + ret <4 x float> %tex +} + +define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { + %res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %tex = extractvalue { <4 x float>, i32 } %res, 0 + %tfe = extractvalue { <4 x float>, i32 } %res, 1 + store i32 %tfe, i32 addrspace(1)* undef + ret <4 x float> %tex +} + declare float @llvm.amdgcn.image.load.2d.f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0 declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0