Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -41,6 +41,8 @@ case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: + case TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS: + case TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS: return true; default: return false; @@ -862,6 +864,21 @@ return true; } + if (SrcDef->getOpcode() == TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS) { + SmallVector Regs; + for (unsigned i = 0; i < NumDefs; ++i) + Regs.push_back(MI.getOperand(i).getReg()); + + Register TruncSrc = SrcDef->getOperand(1).getReg(); + unsigned NumTruncElts = MRI.getType(TruncSrc).getNumElements() - NumDefs; + for (unsigned i = 0; i < NumTruncElts; ++i) + Regs.push_back(MRI.createGenericVirtualRegister(DestTy)); + + Builder.buildUnmerge(Regs, TruncSrc); + markInstAndDefDead(MI, *SrcDef, DeadInsts); + return true; + } + MachineInstr *MergeI = SrcDef; unsigned ConvertOp = 0; @@ -1055,6 +1072,32 @@ return true; } + bool + tryCombinePadVectorWithUndefElts(MachineInstr &MI, + SmallVectorImpl &DeadInsts, + SmallVectorImpl &UpdatedDefs) { + assert(MI.getOpcode() == TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS); + + Builder.setInstrAndDebugLoc(MI); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg()); + + // PadVectorWithUndefElts(DeleteTrailingVectorElts x) -> x + Register TruncSrc; + if (mi_match(SrcReg, MRI, GDeleteTrailingVectorElts(m_Reg(TruncSrc)))) { + if (MRI.getType(DstReg) != MRI.getType(TruncSrc)) + return false; + + LLVM_DEBUG(dbgs() << ".. 
Combine MI: " << MI;); + MRI.replaceRegWith(DstReg, TruncSrc); + UpdatedDefs.push_back(DstReg); + markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts); + return true; + } + + return false; + } + /// Try to combine away MI. /// Returns true if it combined away the MI. /// Adds instructions that are dead as a result of the combine @@ -1080,6 +1123,9 @@ case TargetOpcode::G_ANYEXT: Changed = tryCombineAnyExt(MI, DeadInsts, UpdatedDefs); break; + case TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS: + Changed = tryCombinePadVectorWithUndefElts(MI, DeadInsts, UpdatedDefs); + break; case TargetOpcode::G_ZEXT: Changed = tryCombineZExt(MI, DeadInsts, UpdatedDefs, WrapperObserver); break; @@ -1158,6 +1204,8 @@ switch (MI.getOpcode()) { case TargetOpcode::COPY: case TargetOpcode::G_TRUNC: + case TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS: + case TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: case TargetOpcode::G_SEXT: Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -425,6 +425,12 @@ return UnaryOp_match(Src); } +template +inline UnaryOp_match +GDeleteTrailingVectorElts(const SrcTy &Src) { + return UnaryOp_match(Src); +} + template inline UnaryOp_match m_GBitcast(const SrcTy &Src) { Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1865,6 +1865,19 @@ return buildInstr(TargetOpcode::G_BITREVERSE, {Dst}, {Src}); } + /// Build and insert \p Dst = G_PAD_VECTOR_WITH_UNDEF_ELTS \p Src + MachineInstrBuilder buildPadVectorWithUndefElements(const DstOp &Dst, + const SrcOp &Src) { + return 
buildInstr(TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS, {Dst}, {Src}); + } + + /// Build and insert \p Dst = G_DELETE_TRAILING_VECTOR_ELTS \p Src + MachineInstrBuilder buildDeleteTrailingVectorElement(const DstOp &Dst, + const SrcOp &Src) { + return buildInstr(TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS, {Dst}, + {Src}); + } + virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, ArrayRef SrcOps, Optional Flags = None); Index: llvm/include/llvm/CodeGen/GlobalISel/Utils.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -297,6 +297,11 @@ LLVM_READNONE LLT getLCMType(LLT OrigTy, LLT TargetTy); +/// Return the smallest type that covers \p OrigTy and whose size is a +/// multiple of \p TargetTy. +LLVM_READNONE +LLT getCoverTy(LLT OrigTy, LLT TargetTy); + /// Return a type where the total size is the greatest common divisor of \p /// OrigTy and \p TargetTy. This will try to either change the number of vector /// elements, or bitwidth of scalars. The intent is the result type can be used Index: llvm/include/llvm/Support/TargetOpcodes.def =================================================================== --- llvm/include/llvm/Support/TargetOpcodes.def +++ llvm/include/llvm/Support/TargetOpcodes.def @@ -283,6 +283,12 @@ /// Generic reference to global value. HANDLE_TARGET_OPCODE(G_GLOBAL_VALUE) +/// Extend vector by padding with undef elements. +HANDLE_TARGET_OPCODE(G_PAD_VECTOR_WITH_UNDEF_ELTS) + +/// Shrink vector by deleting trailing elements. +HANDLE_TARGET_OPCODE(G_DELETE_TRAILING_VECTOR_ELTS) + /// Generic instruction to extract blocks of bits from the register given /// (typically a sub-register COPY after instruction selection). 
HANDLE_TARGET_OPCODE(G_EXTRACT) Index: llvm/include/llvm/Target/GenericOpcodes.td =================================================================== --- llvm/include/llvm/Target/GenericOpcodes.td +++ llvm/include/llvm/Target/GenericOpcodes.td @@ -82,6 +82,20 @@ let hasSideEffects = false; } +// Extend vector by padding with undef elements. +def G_PAD_VECTOR_WITH_UNDEF_ELTS : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} + +// Shrink vector by deleting trailing elements. +def G_DELETE_TRAILING_VECTOR_ELTS : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} + def G_IMPLICIT_DEF : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins); Index: llvm/lib/CodeGen/GlobalISel/CallLowering.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -251,7 +251,7 @@ LLT PartLLT = MRI.getType(SrcRegs[0]); // Deal with v3s16 split into v2s16 - LLT LCMTy = getLCMType(LLTy, PartLLT); + LLT LCMTy = getCoverTy(LLTy, PartLLT); if (LCMTy == LLTy) { // Common case where no padding is needed. assert(DstRegs.size() == 1); @@ -262,21 +262,9 @@ // widening the original value. Register UnmergeSrcReg; if (LCMTy != PartLLT) { - // e.g. A <3 x s16> value was split to <2 x s16> - // %register_value0:_(<2 x s16>) - // %register_value1:_(<2 x s16>) - // %undef:_(<2 x s16>) = G_IMPLICIT_DEF - // %concat:_<6 x s16>) = G_CONCAT_VECTORS %reg_value0, %reg_value1, %undef - // %dst_reg:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat - const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); - Register Undef = B.buildUndef(PartLLT).getReg(0); - - // Build vector of undefs. - SmallVector WidenedSrcs(NumWide, Undef); - - // Replace the first sources with the real registers. 
- std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); - UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0); + assert(DstRegs.size() == 1); + return B.buildDeleteTrailingVectorElement(DstRegs[0], + B.buildMerge(LCMTy, SrcRegs)); } else { // We don't need to widen anything if we're extracting a scalar which was // promoted to a vector e.g. s8 -> v4s8 -> s8 @@ -293,6 +281,8 @@ for (int I = DstRegs.size(); I != NumDst; ++I) PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy); + if (PadDstRegs.size() == 1) + return B.buildDeleteTrailingVectorElement(DstRegs[0], UnmergeSrcReg); return B.buildUnmerge(PadDstRegs, UnmergeSrcReg); } @@ -480,7 +470,7 @@ MachineRegisterInfo &MRI = *B.getMRI(); LLT DstTy = MRI.getType(DstRegs[0]); - LLT LCMTy = getLCMType(SrcTy, PartTy); + LLT LCMTy = getCoverTy(SrcTy, PartTy); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); @@ -488,6 +478,7 @@ Register UnmergeSrc = SrcReg; + if(!LCMTy.isVector()){ if (CoveringSize != SrcSize) { // For scalars, it's common to be able to use a simple extension. if (SrcTy.isScalar() && DstTy.isScalar()) { @@ -504,15 +495,12 @@ UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0); } } - - // Unmerge to the original registers and pad with dead defs. 
- SmallVector UnmergeResults(DstRegs.begin(), DstRegs.end()); - for (unsigned Size = DstSize * DstRegs.size(); Size != CoveringSize; - Size += DstSize) { - UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy)); } - B.buildUnmerge(UnmergeResults, UnmergeSrc); + if (LCMTy.isVector() && CoveringSize != SrcSize) + UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0); + + B.buildUnmerge(DstRegs, UnmergeSrc); } bool CallLowering::determineAndHandleAssignments( Index: llvm/lib/CodeGen/GlobalISel/Legalizer.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -109,6 +109,8 @@ case TargetOpcode::G_CONCAT_VECTORS: case TargetOpcode::G_BUILD_VECTOR: case TargetOpcode::G_EXTRACT: + case TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS: + case TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS: return true; case TargetOpcode::G_INSERT: return AllowGInsertAsArtifact; @@ -223,8 +225,15 @@ }; bool Changed = false; SmallVector RetryList; + unsigned IterCount=0; do { LLVM_DEBUG(dbgs() << "=== New Iteration ===\n"); + ++IterCount; + if (IterCount == 1000) { + llvm_unreachable(("infinite loop in " + MF.getName().str() + + " , TODO: fix mir tests\n\n") + .c_str()); + } assert(RetryList.empty() && "Expected no instructions in RetryList"); unsigned NumArtifacts = ArtifactList.size(); while (!InstList.empty()) { Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -1371,37 +1371,15 @@ unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - MO.setReg(widenWithUnmerge(WideTy, MO.getReg())); + Register DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildDeleteTrailingVectorElement(MO, DstExt); 
+ MO.setReg(DstExt); } void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); - - LLT OldTy = MRI.getType(MO.getReg()); - unsigned OldElts = OldTy.getNumElements(); - unsigned NewElts = MoreTy.getNumElements(); - - unsigned NumParts = NewElts / OldElts; - - // Use concat_vectors if the result is a multiple of the number of elements. - if (NumParts * OldElts == NewElts) { - SmallVector Parts; - Parts.push_back(MO.getReg()); - - Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0); - for (unsigned I = 1; I != NumParts; ++I) - Parts.push_back(ImpDef); - - auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts); - MO.setReg(Concat.getReg(0)); - return; - } - - Register MoreReg = MRI.createGenericVirtualRegister(MoreTy); - Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0); - MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0); - MO.setReg(MoreReg); + MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0)); } void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { @@ -2890,6 +2868,72 @@ LLT MemTy = MMO.getMemoryType(); MachineFunction &MF = MIRBuilder.getMF(); + if (MemTy.isVector() && DstTy.isVector()) { + LLT EltTy = MemTy.getScalarType(); + if (DstTy.getScalarType() != EltTy) + return UnableToLegalize; + + LLT PtrTy = MRI.getType(PtrReg); + LLT PtrScalarTy = LLT::scalar(PtrTy.getSizeInBits()); + + unsigned MemSize = MemTy.getSizeInBits(); + unsigned EltSize = MemTy.getScalarSizeInBits(); + unsigned Pow2MemSize = PowerOf2Floor(MemSize); + if (Pow2MemSize == MemTy.getSizeInBits()) + Pow2MemSize >>= 1; + // Split number of elements to get pow-2 load. + // Skip pow-2 load that also require element split (should bitcast vector + // to scalar first and then split scalar load). 
+ if (Pow2MemSize % EltSize) + return UnableToLegalize; + + SmallVector EltsForPow2MemSize; + SmallVector RemainingElts; + + unsigned NumEltsForPow2MemSize = Pow2MemSize / EltSize; + for (unsigned i = 0; i < NumEltsForPow2MemSize; ++i) + EltsForPow2MemSize.push_back(MRI.createGenericVirtualRegister(EltTy)); + LLT Pow2Ty = NumEltsForPow2MemSize == 1 + ? EltTy + : LLT::fixed_vector(NumEltsForPow2MemSize, EltTy); + MachineMemOperand *Pow2MMO = MF.getMachineMemOperand(&MMO, 0, Pow2Ty); + if (NumEltsForPow2MemSize == 1) { + MIRBuilder.buildLoad(EltsForPow2MemSize[0], PtrReg, *Pow2MMO); + } else { + MIRBuilder.buildUnmerge(EltsForPow2MemSize, + MIRBuilder.buildLoad(Pow2Ty, PtrReg, *Pow2MMO)); + } + + unsigned NumEltsForRem = MemSize / EltSize - NumEltsForPow2MemSize; + for (unsigned i = 0; i < NumEltsForRem; ++i) + RemainingElts.push_back(MRI.createGenericVirtualRegister(EltTy)); + + LLT RemTy = + NumEltsForRem == 1 ? EltTy : LLT::fixed_vector(NumEltsForRem, EltTy); + + MachineMemOperand *RemMMO = + MF.getMachineMemOperand(&MMO, Pow2MemSize, RemTy); + auto OffsetCst = MIRBuilder.buildConstant(PtrScalarTy, Pow2MemSize / 8); + auto RemPtr = MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); + + if (NumEltsForRem == 1) { + MIRBuilder.buildLoad(RemainingElts[0], RemPtr, *RemMMO); + } else { + MIRBuilder.buildUnmerge(RemainingElts, + MIRBuilder.buildLoad(RemTy, RemPtr, *RemMMO)); + } + + SmallVector AllElts; + for (unsigned i = 0; i < NumEltsForPow2MemSize; ++i) + AllElts.push_back(EltsForPow2MemSize[i]); + for (unsigned i = 0; i < NumEltsForRem; ++i) + AllElts.push_back(RemainingElts[i]); + + MIRBuilder.buildMerge(DstReg, AllElts); + LoadMI.eraseFromParent(); + return Legalized; + } + unsigned MemSizeInBits = MemTy.getSizeInBits(); unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes(); @@ -3028,10 +3072,74 @@ Register SrcReg = StoreMI.getValueReg(); Register PtrReg = StoreMI.getPointerReg(); LLT SrcTy = MRI.getType(SrcReg); + LLT PtrTy = MRI.getType(PtrReg); + LLT 
PtrScalarTy = LLT::scalar(PtrTy.getSizeInBits()); MachineFunction &MF = MIRBuilder.getMF(); MachineMemOperand &MMO = **StoreMI.memoperands_begin(); LLT MemTy = MMO.getMemoryType(); + if (MemTy.isVector() && SrcTy.isVector()) { + LLT EltTy = MemTy.getScalarType(); + if (SrcTy.getScalarType() != EltTy) + return UnableToLegalize; + + unsigned EltSize = MemTy.getScalarSizeInBits(); + unsigned Pow2MemSize = PowerOf2Floor(MemTy.getSizeInBits()); + // If we already have pow-2 memsize, split in two equal parts. + if (Pow2MemSize == MemTy.getSizeInBits()) + Pow2MemSize >>= 1; + // Split number of elements to get pow-2 store. + // Skip pow-2 store that also require element split (should bitcast vector + // to scalar first and then split scalar store). + if (Pow2MemSize % EltSize) + return UnableToLegalize; + + unsigned NumEltsForPow2MemSize = Pow2MemSize / EltSize; + + SmallVector Elts; + SmallVector EltsForPow2MemSize; + SmallVector RemainingElts; + + extractParts(SrcReg, SrcTy.getElementType(), SrcTy.getNumElements(), Elts); + for (unsigned i = 0; i < NumEltsForPow2MemSize; ++i) + EltsForPow2MemSize.push_back(Elts[i]); + for (unsigned i = NumEltsForPow2MemSize; i < Elts.size(); ++i) + RemainingElts.push_back(Elts[i]); + + + Register Pow2Regs; + if (EltsForPow2MemSize.size() == 1) + Pow2Regs = EltsForPow2MemSize[0]; + else { + LLT RemTy = LLT::fixed_vector(EltsForPow2MemSize.size(), EltSize); + Pow2Regs = MIRBuilder.buildMerge(RemTy, EltsForPow2MemSize).getReg(0); + } + + MachineMemOperand *Pow2MMO = MF.getMachineMemOperand(&MMO, 0, MRI.getType(Pow2Regs)); + MIRBuilder.buildStore(Pow2Regs, PtrReg, *Pow2MMO); + + + + Register RemainingRegs; + if (RemainingElts.size() == 1) + RemainingRegs = RemainingElts[0]; + else { + LLT RemTy = LLT::fixed_vector(RemainingElts.size(), EltSize); + RemainingRegs = MIRBuilder.buildMerge(RemTy, RemainingElts).getReg(0); + } + + MachineMemOperand *RemMMO = + MF.getMachineMemOperand(&MMO, Pow2MemSize, MRI.getType(RemainingRegs)); + + auto 
OffsetCst = MIRBuilder.buildConstant(PtrScalarTy, Pow2MemSize / 8); + auto RemPtr = MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); + + MIRBuilder.buildStore(RemainingRegs, RemPtr, *RemMMO); + + StoreMI.eraseFromParent(); + return Legalized; + } + unsigned StoreWidth = MemTy.getSizeInBits(); unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes(); @@ -3101,11 +3209,9 @@ auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt); // Generate the PtrAdd and truncating stores. - LLT PtrTy = MRI.getType(PtrReg); - auto OffsetCst = MIRBuilder.buildConstant( - LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); - auto SmallPtr = - MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); + auto OffsetCst = MIRBuilder.buildConstant(PtrScalarTy, LargeSplitSize / 8); + Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); + auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst); MachineMemOperand *LargeMMO = MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); @@ -3482,6 +3588,32 @@ case G_ROTL: case G_ROTR: return lowerRotate(MI); + case G_DELETE_TRAILING_VECTOR_ELTS: + case G_PAD_VECTOR_WITH_UNDEF_ELTS: { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT EltTy = MRI.getType(Src).getElementType(); + unsigned DstNumElts = MRI.getType(Dst).getNumElements(); + + auto Unmerge = MIRBuilder.buildUnmerge(EltTy, Src); + SmallVector Regs; + + if (MI.getOpcode() == TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS) { + for (unsigned i = 0; i < DstNumElts; ++i) + Regs.push_back(Unmerge.getReg(i)); + } else { + unsigned SrcNumElts = MRI.getType(Src).getNumElements(); + for (unsigned i = 0; i < SrcNumElts; ++i) + Regs.push_back(Unmerge.getReg(i)); + Register Undef = MIRBuilder.buildUndef(EltTy).getReg(0); + for (unsigned i = 0; i < DstNumElts - SrcNumElts; ++i) + Regs.push_back(Undef); + } + + MIRBuilder.buildMerge(Dst, Regs); + MI.eraseFromParent(); + return Legalized; + } } } @@ -5067,6 +5199,29 @@ return 
moreElementsVectorPhi(MI, TypeIdx, MoreTy); case TargetOpcode::G_SHUFFLE_VECTOR: return moreElementsVectorShuffle(MI, TypeIdx, MoreTy); + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_BUILD_VECTOR_TRUNC: { + SmallVector Elts; + for (const auto &Op : MI.uses()) { + Elts.push_back(Op.getReg()); + } + + LLT SrcEltTy = MRI.getType(MI.getOperand(1).getReg()); + for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) + Elts.push_back(MIRBuilder.buildUndef(SrcEltTy)); + + MIRBuilder.buildDeleteTrailingVectorElement( + MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts)); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_TRUNC: { + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, MoreTy, 1); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } default: return UnableToLegalize; } @@ -6705,6 +6860,24 @@ LLT VecTy = MRI.getType(SrcVec); LLT EltTy = VecTy.getElementType(); + unsigned NumElts = VecTy.getNumElements(); + + int64_t IdxVal; + if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal >= 0 && IdxVal < NumElts) { + SmallVector SrcRegs; + extractParts(SrcVec, EltTy, NumElts, SrcRegs); + + if (InsertVal) { + SrcRegs[IdxVal] = MI.getOperand(2).getReg(); + MIRBuilder.buildMerge(DstReg, SrcRegs); + } else { + MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]); + } + + MI.eraseFromParent(); + return Legalized; + } + if (!EltTy.isByteSized()) { // Not implemented. LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n"); return UnableToLegalize; @@ -6723,7 +6896,6 @@ // if the index is out of bounds. 
Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx); - int64_t IdxVal; if (mi_match(Idx, MRI, m_ICst(IdxVal))) { int64_t Offset = IdxVal * EltBytes; PtrInfo = PtrInfo.getWithOffset(Offset); @@ -6755,7 +6927,6 @@ Register Src1Reg = MI.getOperand(2).getReg(); LLT Src0Ty = MRI.getType(Src0Reg); LLT DstTy = MRI.getType(DstReg); - LLT IdxTy = LLT::scalar(32); ArrayRef Mask = MI.getOperand(3).getShuffleMask(); @@ -6778,6 +6949,14 @@ Register Undef; SmallVector BuildVec; LLT EltTy = DstTy.getElementType(); + SmallVector Src0Regs, Src1Regs; + + int NumElts = 0; + if (Src0Ty.isVector()) { + NumElts = Src0Ty.getNumElements(); + extractParts(Src0Reg, EltTy, NumElts, Src0Regs); + extractParts(Src1Reg, EltTy, NumElts, Src1Regs); + } for (int Idx : Mask) { if (Idx < 0) { @@ -6790,12 +6969,9 @@ if (Src0Ty.isScalar()) { BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg); } else { - int NumElts = Src0Ty.getNumElements(); - Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; + ArrayRef SrcVec = Idx < NumElts ? Src0Regs : Src1Regs; int ExtractIdx = Idx < NumElts ? 
Idx : Idx - NumElts; - auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); - auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); - BuildVec.push_back(Extract.getReg(0)); + BuildVec.push_back(SrcVec[ExtractIdx]); } } Index: llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -1222,6 +1222,23 @@ "input vectors do not exactly cover the output vector register"); break; } + case TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS: + case TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS: { + assert(DstOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type"); + assert(SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type"); + assert(DstOps[0].getLLTTy(*getMRI()).getElementType() == + SrcOps[0].getLLTTy(*getMRI()).getElementType() && + "Element type mismatch"); + if (Opc == TargetOpcode::G_DELETE_TRAILING_VECTOR_ELTS) + assert(DstOps[0].getLLTTy(*getMRI()).getNumElements() < + SrcOps[0].getLLTTy(*getMRI()).getNumElements() && + "Element count mismatch"); + if (Opc == TargetOpcode::G_PAD_VECTOR_WITH_UNDEF_ELTS) + assert(DstOps[0].getLLTTy(*getMRI()).getNumElements() > + SrcOps[0].getLLTTy(*getMRI()).getNumElements() && + "Element count mismatch"); + break; + } case TargetOpcode::G_UADDE: { assert(DstOps.size() == 2 && "Invalid no of dst operands"); assert(SrcOps.size() == 3 && "Invalid no of src operands"); Index: llvm/lib/CodeGen/GlobalISel/Utils.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -815,6 +815,20 @@ return LLT::scalar(LCMSize); } +LLT llvm::getCoverTy(LLT OrigTy, LLT TargetTy) { + if (!OrigTy.isVector() || !TargetTy.isVector() || OrigTy == TargetTy || + (OrigTy.getScalarSizeInBits() != TargetTy.getScalarSizeInBits())) + return getLCMType(OrigTy, 
TargetTy); + + unsigned OrigTyNumElts = OrigTy.getNumElements(); + unsigned TargetTyNumElts = TargetTy.getNumElements(); + + if (OrigTyNumElts % TargetTyNumElts == 0) + return OrigTy; + unsigned NumElts = (OrigTyNumElts / TargetTyNumElts + 1) * TargetTyNumElts; + return LLT::fixed_vector(NumElts, OrigTy.getScalarType()); +} + LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) { const unsigned OrigSize = OrigTy.getSizeInBits(); const unsigned TargetSize = TargetTy.getSizeInBits(); Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -76,6 +76,18 @@ }; } +static LegalityPredicate iss16OddVector(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + if (!Ty.isVector()) + return false; + + const LLT EltTy = Ty.getElementType(); + const unsigned EltSize = EltTy.getSizeInBits(); + return Ty.getNumElements() % 2 == 1 && EltSize == 16; + }; +} + static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; @@ -104,6 +116,9 @@ return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; const LLT EltTy = Ty.getElementType(); + if (EltTy == LLT::scalar(16)) { + return std::make_pair(TypeIdx, LLT::fixed_vector(2, EltTy)); + } unsigned Size = Ty.getSizeInBits(); unsigned Pieces = (Size + 63) / 64; unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; @@ -620,7 +635,7 @@ getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) .clampScalar(0, S32, S64) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .moreElementsIf(iss16OddVector(0), oneMoreElement(0)) .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) .widenScalarToNextPow2(0) .scalarize(0); @@ -654,7 +669,7 @@ // s1 and s16 are special 
cases because they have legal operations on // them, but don't really occupy registers in the normal way. .legalFor({S1, S16}) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .moreElementsIf(iss16OddVector(0), oneMoreElement(0)) .clampScalarOrElt(0, S32, MaxScalar) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16); @@ -1051,12 +1066,12 @@ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); unsigned AlignBits = Query.MMODescrs[0].AlignInBits; - if (MemSize < DstTy.getSizeInBits()) - MemSize = std::max(MemSize, AlignBits); - if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; + if (MemSize < DstTy.getSizeInBits()) + MemSize = std::max(MemSize, AlignBits); + const LLT PtrTy = Query.Types[1]; unsigned AS = PtrTy.getAddressSpace(); if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) @@ -1196,75 +1211,6 @@ unsigned Align = Query.MMODescrs[0].AlignInBits; return std::make_pair(0, LLT::scalar(Align)); }) - .fewerElementsIf( - [=](const LegalityQuery &Query) -> bool { - return Query.Types[0].isVector() && - needToSplitMemOp(Query, Op == G_LOAD); - }, - [=](const LegalityQuery &Query) -> std::pair { - const LLT DstTy = Query.Types[0]; - const LLT PtrTy = Query.Types[1]; - - LLT EltTy = DstTy.getElementType(); - unsigned MaxSize = maxSizeForAddrSpace(ST, - PtrTy.getAddressSpace(), - Op == G_LOAD); - - // FIXME: Handle widened to power of 2 results better. This ends - // up scalarizing. - // FIXME: 3 element stores scalarized on SI - - // Split if it's too large for the address space. 
- unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - if (MemSize > MaxSize) { - unsigned NumElts = DstTy.getNumElements(); - unsigned EltSize = EltTy.getSizeInBits(); - - if (MaxSize % EltSize == 0) { - return std::make_pair( - 0, LLT::scalarOrVector( - ElementCount::getFixed(MaxSize / EltSize), EltTy)); - } - - unsigned NumPieces = MemSize / MaxSize; - - // FIXME: Refine when odd breakdowns handled - // The scalars will need to be re-legalized. - if (NumPieces == 1 || NumPieces >= NumElts || - NumElts % NumPieces != 0) - return std::make_pair(0, EltTy); - - return std::make_pair( - 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); - } - - // FIXME: We could probably handle weird extending loads better. - if (DstTy.getSizeInBits() > MemSize) - return std::make_pair(0, EltTy); - - unsigned EltSize = EltTy.getSizeInBits(); - unsigned DstSize = DstTy.getSizeInBits(); - if (!isPowerOf2_32(DstSize)) { - // We're probably decomposing an odd sized store. Try to split - // to the widest type. TODO: Account for alignment. As-is it - // should be OK, since the new parts will be further legalized. - unsigned FloorSize = PowerOf2Floor(DstSize); - return std::make_pair( - 0, LLT::scalarOrVector( - ElementCount::getFixed(FloorSize / EltSize), EltTy)); - } - - // Need to split because of alignment. - unsigned Align = Query.MMODescrs[0].AlignInBits; - if (EltSize > Align && - (EltSize / Align < DstTy.getNumElements())) { - return std::make_pair( - 0, LLT::fixed_vector(EltSize / Align, EltTy)); - } - - // May need relegalization for the scalars. 
- return std::make_pair(0, EltTy); - }) .minScalar(0, S32) .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) .widenScalarToNextPow2(0) @@ -1342,7 +1288,7 @@ {S1, S32}) .clampScalar(0, S16, S64) .scalarize(1) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .moreElementsIf(iss16OddVector(0), oneMoreElement(0)) .fewerElementsIf(numElementsNotEven(0), scalarize(0)) .clampMaxNumElements(0, S32, 2) .clampMaxNumElements(0, LocalPtr, 2) @@ -1483,6 +1429,7 @@ .legalForCartesianProduct(AllS64Vectors, {S64}) .clampNumElements(0, V16S32, V32S32) .clampNumElements(0, V2S64, V16S64) + .moreElementsIf(iss16OddVector(0), oneMoreElement(0)) .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); if (ST.hasScalarPackInsts()) { @@ -1684,6 +1631,10 @@ G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) .unsupported(); + getActionDefinitionsBuilder( + {G_DELETE_TRAILING_VECTOR_ELTS, G_PAD_VECTOR_WITH_UNDEF_ELTS}) + .lower(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -2234,9 +2185,10 @@ LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Dst)); - if (IdxVal < VecTy.getNumElements()) - B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); - else + if (IdxVal < VecTy.getNumElements()) { + auto Unmerge = B.buildUnmerge(EltTy, Vec); + B.buildCopy(Dst, Unmerge.getReg(IdxVal)); + } else B.buildUndef(Dst); MI.eraseFromParent(); @@ -2267,9 +2219,22 @@ LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Ins)); - if (IdxVal < VecTy.getNumElements()) - B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); - else + if (IdxVal < VecTy.getNumElements()) { + unsigned NumElts = VecTy.getNumElements(); + + SmallVector SrcRegs; + + for (unsigned i = 0; i < NumElts; ++i) + SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); + B.buildUnmerge(SrcRegs, Vec); + + SrcRegs[IdxVal] = MI.getOperand(2).getReg(); + B.buildMerge(Dst, SrcRegs); + } else + + // if (IdxVal < VecTy.getNumElements()) + // B.buildInsert(Dst, Vec, 
Ins, IdxVal * EltTy.getSizeInBits()); + // else B.buildUndef(Dst); MI.eraseFromParent(); @@ -2570,10 +2535,8 @@ } else { // For cases where the widened type isn't a nice register value, unmerge // from a widened register (e.g. <3 x s16> -> <4 x s16>) - B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); - B.setInsertPt(B.getMBB(), MI.getIterator()); - B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildDeleteTrailingVectorElement(ValReg, WideLoad); } } @@ -3835,6 +3798,10 @@ llvm_unreachable("invalid data type"); } + if (StoreVT == LLT::fixed_vector(3, S16)) { + Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) + .getReg(0); + } return Reg; } @@ -4668,9 +4635,22 @@ // Deal with the one annoying legal case. const LLT V3S16 = LLT::fixed_vector(3, 16); if (Ty == V3S16) { - padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); - auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); - B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + if (IsTFE) { + if (ResultRegs.size() == 1) { + NewResultReg = ResultRegs[0]; + } else if (ResultRegs.size() == 2) { + LLT V4S16 = LLT::fixed_vector(4, 16); + NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); + } else { + return false; + } + } + + if (MRI->getType(DstReg).getNumElements() < + MRI->getType(NewResultReg).getNumElements()) + B.buildDeleteTrailingVectorElement(DstReg, NewResultReg); + else + B.buildPadVectorWithUndefElements(DstReg, NewResultReg); return true; } Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1110,24 +1110,6 @@ MI.getOperand(OpIdx).setReg(SGPR); } -/// Split \p Ty into 2 pieces. 
The first will have \p FirstSize bits, and the -/// rest will be in the remainder. -static std::pair splitUnequalType(LLT Ty, unsigned FirstSize) { - unsigned TotalSize = Ty.getSizeInBits(); - if (!Ty.isVector()) - return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; - - LLT EltTy = Ty.getElementType(); - unsigned EltSize = EltTy.getSizeInBits(); - assert(FirstSize % EltSize == 0); - - unsigned FirstPartNumElts = FirstSize / EltSize; - unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; - - return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy), - LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)}; -} - static LLT widen96To128(LLT Ty) { if (!Ty.isVector()) return LLT::scalar(128); @@ -1187,18 +1169,24 @@ // 96-bit loads are only available for vector loads. We need to split this // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). if (MMO->getAlign() < Align(16)) { - LLT Part64, Part32; - std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); - auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); - auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); - - auto Undef = B.buildUndef(LoadTy); - auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); - B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + MachineFunction *MF = MI.getParent()->getParent(); + ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); + MachineIRBuilder B(MI, ApplyBank); + LegalizerHelper Helper(*MF, ApplyBank, B); + if (Helper.lowerLoad(cast(MI)) != LegalizerHelper::Legalized) + return false; + return true; } else { LLT WiderTy = widen96To128(LoadTy); auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + if (WiderTy.isScalar()) + B.buildTrunc(MI.getOperand(0), WideLoad); + else { + auto Unmerge = B.buildUnmerge(WiderTy.getElementType(), WideLoad); + B.buildMerge( + MI.getOperand(0).getReg(), + {Unmerge.getReg(0), Unmerge.getReg(1), 
Unmerge.getReg(2)}); + } } } Index: llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-vectors.ll =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-vectors.ll +++ llvm/test/CodeGen/AArch64/GlobalISel/call-lowering-vectors.ll @@ -55,9 +55,7 @@ ; CHECK: BL @bar, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit-def $q0 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[BITCAST]](<4 x s32>), [[DEF1]](<4 x s32>), [[DEF1]](<4 x s32>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) + ; CHECK: [[TRUNC_ELTS:%[0-9]+]]:_(<3 x s32>) = G_TRUNC_ELTS [[BITCAST]](<4 x s32>) ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp ; CHECK: RET_ReallyLR %call = call <3 x float> @bar(float undef) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -32,27 +32,27 @@ ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 
v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 +; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 ; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc +; GCN-NEXT: global_load_dwordx4 v[16:19], v[3:4], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:192 ; GCN-NEXT: v_and_b32_e32 v0, 63, v2 ; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -61,68 +61,50 @@ ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[56:59], v[60:61], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:264 +; 
GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:396 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 ; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 @@ -131,76 +113,51 @@ ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, 
off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 
offset:368 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 
offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 @@ -237,7 +194,6 @@ ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; 
GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill @@ -254,185 +210,325 @@ ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: v_mov_b32_e32 v25, 0xffff +; GCN-NEXT: v_mov_b32_e32 v29, v2 ; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v19, vcc, 64, v0 +; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v1, vcc +; GCN-NEXT: s_add_i32 s32, s32, 0x10000 +; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GCN-NEXT: v_and_or_b32 v2, v3, v25, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: v_and_or_b32 v2, v4, v25, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: v_and_or_b32 v2, v5, v25, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, 
v7 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: v_and_or_b32 v2, v6, v25, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: v_and_or_b32 v30, v7, v25, v15 +; GCN-NEXT: v_and_or_b32 v31, v8, v25, v16 +; GCN-NEXT: v_and_or_b32 v32, v9, v25, v17 +; GCN-NEXT: v_and_or_b32 v33, v10, v25, v18 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v34, v8, v25, v3 +; GCN-NEXT: v_and_or_b32 v35, v9, v25, v4 +; GCN-NEXT: v_and_or_b32 v36, v10, v25, v5 +; GCN-NEXT: v_and_or_b32 v37, v11, v25, v6 +; GCN-NEXT: v_and_or_b32 v38, v12, v25, v7 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:128 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_and_or_b32 v39, v13, v25, v16 +; GCN-NEXT: v_and_or_b32 v48, v14, v25, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_and_or_b32 v49, v15, v25, v18 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_or_b32 v50, v3, v25, v11 +; GCN-NEXT: v_and_or_b32 v51, v4, v25, v12 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 +; GCN-NEXT: v_and_or_b32 v52, v5, v25, v13 +; GCN-NEXT: v_and_or_b32 v53, v6, v25, v14 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 ; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_bfe_u32 v0, v2, 1, 6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GCN-NEXT: s_add_i32 s32, s32, 0x10000 -; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 +; GCN-NEXT: v_add_co_u32_e32 v23, vcc, v0, v5 +; GCN-NEXT: v_addc_co_u32_e32 v24, vcc, v1, v6, vcc +; GCN-NEXT: 
global_load_dwordx4 v[11:14], v[19:20], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[23:24], off offset:48 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_and_or_b32 v54, v11, v25, v5 +; GCN-NEXT: v_and_or_b32 v55, v12, v25, v6 +; GCN-NEXT: v_and_or_b32 v40, v13, v25, v21 +; GCN-NEXT: v_and_or_b32 v41, v14, v25, v22 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[19:20], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[19:20], off offset:48 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v42, v11, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v43, v12, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v44, v13, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v28, v14, v25, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v45, v19, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v46, v20, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v47, v21, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v56, v22, v25, v5 +; 
GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v57, v7, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v58, v8, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v59, v9, v25, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v0, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_or_b32 v60, v10, v25, v5 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v4, vcc +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:192 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[12:13], off offset:16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_or_b32 v27, v7, v25, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NEXT: v_and_or_b32 v61, v9, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v26, v8, v25, v1 +; GCN-NEXT: v_and_or_b32 v62, v10, v25, v7 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[12:13], off offset:48 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v14, v8, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v13, v9, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v63, v10, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v12, v11, v25, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v19, v19, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v20, v20, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v21, v21, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: 
v_and_or_b32 v22, v22, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v3, v3, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_or_b32 v2, v4, v25, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_or_b32 v1, v5, v25, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_or_b32 v0, v6, v25, v4 +; GCN-NEXT: global_load_dwordx4 v[4:7], v[23:24], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[23:24], off offset:32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v4, v4, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v5, v5, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v6, v6, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v7, v7, v25, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 
offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: 
buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v8, v8, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v9, v9, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v10, v10, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v11, v11, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v15, v15, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v16, v16, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v17, v17, v25, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_or_b32 v18, v18, v25, v23 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:512 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:308 +; GCN-NEXT: 
buffer_store_dword v48, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 
offset:432 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 ; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 ; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v11, 6, s33 -; GCN-NEXT: v_add_u32_e32 v11, 0x100, v11 -; GCN-NEXT: v_add_u32_e32 v0, v11, v0 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_bfe_u32 v0, v29, 1, 6 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 +; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -450,6 +546,8 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v1, 1, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt 
vmcnt(16) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -488,27 +586,27 @@ ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 ; GCN-NEXT: s_movk_i32 s4, 0xc0 ; GCN-NEXT: v_mov_b32_e32 v6, s5 +; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 ; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc +; GCN-NEXT: global_load_dwordx4 v[16:19], v[3:4], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[48:51], 
v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:192 ; GCN-NEXT: v_and_b32_e32 v0, 31, v2 ; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -517,68 +615,50 @@ ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 
; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, 
s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[56:59], v[60:61], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: 
buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 
offset:396 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 ; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 @@ -587,76 +667,51 @@ ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v12, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:460 ; GCN-NEXT: 
buffer_store_dword v4, off, s[0:3], s33 offset:464 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; 
GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -843,9 +843,8 @@ ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) - ; CHECK: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[LOAD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<4 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<3 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<4 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -894,9 +893,8 @@ ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (volatile load (p1) from `<5 x i16> addrspace(1)* addrspace(4)* undef`, addrspace 4) ; CHECK: [[LOAD1:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[LOAD]](p1) 
:: (load (<5 x s16>) from %ir.ptr, align 16, addrspace 1) - ; CHECK: [[DEF1:%[0-9]+]]:_(<5 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[LOAD1]](<5 x s16>), [[DEF1]](<5 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<6 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD1]](<5 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<6 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: $vgpr2 = COPY [[UV2]](<2 x s16>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2762,6 +2762,9 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] @@ -2771,22 +2774,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 +; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, 
s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v12, s[6:7] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v12, s[10:11] +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: @@ -2810,8 +2810,6 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] @@ -2822,18 +2820,20 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v8, s14, v8 -; GFX8-NEXT: v_or_b32_e32 v8, s13, v8 +; GFX8-NEXT: v_or_b32_e32 v10, s13, v8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 
v5, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm @@ -2942,57 +2942,57 @@ ; GFX9-LABEL: insertelement_s_v16i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 1 ; GFX9-NEXT: s_mov_b32 s3, 0xffff ; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cselect_b32 s0, s9, s8 -; GFX9-NEXT: s_cmp_eq_u32 s2, 2 -; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 3 -; GFX9-NEXT: s_cselect_b32 s0, s11, s0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 4 -; GFX9-NEXT: s_cselect_b32 s0, s12, s0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 5 -; GFX9-NEXT: s_cselect_b32 s0, s13, s0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 6 -; GFX9-NEXT: s_cselect_b32 s0, s14, s0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 7 -; GFX9-NEXT: s_cselect_b32 s0, s15, s0 -; GFX9-NEXT: s_and_b32 s1, s4, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_andn2_b32 s0, s0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 +; GFX9-NEXT: s_cselect_b32 s1, s9, s8 +; GFX9-NEXT: s_cmp_eq_u32 s0, 2 +; GFX9-NEXT: s_cselect_b32 s1, s10, s1 +; GFX9-NEXT: 
s_cmp_eq_u32 s0, 3 +; GFX9-NEXT: s_cselect_b32 s1, s11, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 4 +; GFX9-NEXT: s_cselect_b32 s1, s12, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 5 +; GFX9-NEXT: s_cselect_b32 s1, s13, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 6 +; GFX9-NEXT: s_cselect_b32 s1, s14, s1 +; GFX9-NEXT: s_cmp_eq_u32 s0, 7 +; GFX9-NEXT: s_cselect_b32 s1, s15, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 1 +; GFX9-NEXT: s_lshl_b32 s2, s2, 4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s2 +; GFX9-NEXT: s_andn2_b32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_lshl_or_b32 v10, v0, s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 -; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 4 +; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 7 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] @@ -3001,53 +3001,53 @@ ; GFX8-LABEL: insertelement_s_v16i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX8-NEXT: s_lshr_b32 s2, s4, 1 -; GFX8-NEXT: s_cmp_eq_u32 s2, 1 +; GFX8-NEXT: s_lshr_b32 s0, s4, 1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 1 ; GFX8-NEXT: s_mov_b32 s3, 0xffff -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cselect_b32 s0, s9, s8 -; GFX8-NEXT: s_cmp_eq_u32 s2, 2 -; GFX8-NEXT: s_cselect_b32 s0, s10, s0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 3 -; GFX8-NEXT: s_cselect_b32 s0, s11, s0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 4 -; GFX8-NEXT: s_cselect_b32 s0, s12, s0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 5 -; GFX8-NEXT: s_cselect_b32 s0, s13, s0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 6 -; GFX8-NEXT: s_cselect_b32 s0, s14, s0 -; GFX8-NEXT: s_cmp_eq_u32 s2, 7 -; GFX8-NEXT: s_cselect_b32 s0, s15, s0 -; GFX8-NEXT: s_and_b32 s1, s4, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_cselect_b32 s1, s9, s8 +; GFX8-NEXT: s_cmp_eq_u32 s0, 2 +; GFX8-NEXT: s_cselect_b32 s1, s10, s1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 3 +; GFX8-NEXT: s_cselect_b32 s1, s11, s1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 4 +; GFX8-NEXT: s_cselect_b32 s1, s12, s1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 5 +; GFX8-NEXT: 
s_cselect_b32 s1, s13, s1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 6 +; GFX8-NEXT: s_cselect_b32 s1, s14, s1 +; GFX8-NEXT: s_cmp_eq_u32 s0, 7 +; GFX8-NEXT: s_cselect_b32 s1, s15, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 1 +; GFX8-NEXT: s_lshl_b32 s2, s2, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 +; GFX8-NEXT: s_andn2_b32 s1, s1, s2 +; GFX8-NEXT: v_or_b32_e32 v8, s1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v7, s15 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 @@ -3060,52 +3060,52 @@ ; 
GFX7-LABEL: insertelement_s_v16i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX7-NEXT: s_lshr_b32 s2, s4, 1 -; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_lshr_b32 s0, s4, 1 +; GFX7-NEXT: s_cmp_eq_u32 s0, 1 ; GFX7-NEXT: s_mov_b32 s3, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cselect_b32 s0, s9, s8 -; GFX7-NEXT: s_cmp_eq_u32 s2, 2 -; GFX7-NEXT: s_cselect_b32 s0, s10, s0 -; GFX7-NEXT: s_cmp_eq_u32 s2, 3 -; GFX7-NEXT: s_cselect_b32 s0, s11, s0 -; GFX7-NEXT: s_cmp_eq_u32 s2, 4 -; GFX7-NEXT: s_cselect_b32 s0, s12, s0 -; GFX7-NEXT: s_cmp_eq_u32 s2, 5 -; GFX7-NEXT: s_cselect_b32 s0, s13, s0 -; GFX7-NEXT: s_cmp_eq_u32 s2, 6 -; GFX7-NEXT: s_cselect_b32 s0, s14, s0 -; GFX7-NEXT: s_cmp_eq_u32 s2, 7 -; GFX7-NEXT: s_cselect_b32 s0, s15, s0 -; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 +; GFX7-NEXT: s_cselect_b32 s1, s9, s8 +; GFX7-NEXT: s_cmp_eq_u32 s0, 2 +; GFX7-NEXT: s_cselect_b32 s1, s10, s1 +; GFX7-NEXT: s_cmp_eq_u32 s0, 3 +; GFX7-NEXT: s_cselect_b32 s1, s11, s1 +; GFX7-NEXT: s_cmp_eq_u32 s0, 4 +; GFX7-NEXT: s_cselect_b32 s1, s12, s1 +; GFX7-NEXT: s_cmp_eq_u32 s0, 5 +; GFX7-NEXT: s_cselect_b32 s1, s13, s1 +; GFX7-NEXT: s_cmp_eq_u32 s0, 6 +; GFX7-NEXT: s_cselect_b32 s1, s14, s1 +; GFX7-NEXT: s_cmp_eq_u32 s0, 7 +; GFX7-NEXT: s_cselect_b32 s1, s15, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 1 +; GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX7-NEXT: s_lshl_b32 s2, s3, s2 +; GFX7-NEXT: s_andn2_b32 s1, s1, s2 +; GFX7-NEXT: v_or_b32_e32 v8, s1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; 
GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 5 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX7-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX7-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 6 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 7 ; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -3214,27 +3214,27 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 +; GFX9-NEXT: v_and_or_b32 v10, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; 
GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] @@ -3454,7 +3454,7 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 +; GFX9-NEXT: v_and_or_b32 v10, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 @@ -3464,17 +3464,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s18 ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, 
vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] @@ -3679,6 +3679,7 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] @@ -3688,21 +3689,20 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_and_or_b32 v13, v11, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v13, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v13, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v13, s[4:5] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], 
off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v13, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v13, s[10:11] +; GFX9-NEXT: global_store_dwordx4 v[11:12], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_v: @@ -3737,21 +3737,21 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v1, v11, v1 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_or_b32_e32 v13, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v13, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v13, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v13, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v12, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v13, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v13, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v13, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_s_v: @@ -3873,6 
+3873,9 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] @@ -3882,22 +3885,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 +; GFX9-NEXT: v_and_or_b32 v13, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v13, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v13, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v13, s[4:5] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v13, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v13, s[10:11] +; GFX9-NEXT: global_store_dwordx4 v[11:12], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: @@ -3921,6 +3921,8 @@ ; GFX8-NEXT: 
v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v12, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] @@ -3931,22 +3933,20 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v1, s13, v1 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v13, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v13, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v13, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v13, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v13, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v13, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v13, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_v_s: @@ -4067,7 +4067,10 @@ ; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -4077,21 +4080,18 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX9-NEXT: v_and_or_b32 v14, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v14, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v14, s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v14, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v14, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v14, s[10:11] +; GFX9-NEXT: global_store_dwordx4 v[12:13], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v15, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_v: @@ -4114,7 +4114,9 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; 
GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_mov_b32_e32 v12, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -4125,21 +4127,19 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v14, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v14, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v14, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v14, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v14, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v14, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_v_v: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,123 +6,138 @@ ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v68, 8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v0, v68 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v0, v64 ; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 +; GCN-NEXT: v_add_co_u32_e32 v56, vcc, 64, v2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_addc_co_u32_e32 v57, vcc, 0, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v64, vcc, v2, v0 +; GCN-NEXT: v_add_co_u32_e32 v58, vcc, v2, v0 ; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_addc_co_u32_e32 v65, vcc, v3, v1, vcc +; GCN-NEXT: v_addc_co_u32_e32 v59, vcc, v3, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v3, v1, vcc -; GCN-NEXT: global_load_dwordx4 v[44:47], v68, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[48:51], v68, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v68, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v68, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[60:63], v68, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[4:7], v[64:65], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[64:65], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[64:65], off offset:48 -; 
GCN-NEXT: global_load_dwordx4 v[20:23], v[66:67], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[66:67], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[66:67], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[0:3], v68, s[0:1] offset:128 -; GCN-NEXT: global_load_dwordx4 v[16:19], v68, s[0:1] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v68, v[0:3], s[2:3] offset:128 -; GCN-NEXT: global_store_dwordx4 v68, v[4:7], s[2:3] offset:144 -; GCN-NEXT: global_store_dwordx4 v68, v[8:11], s[2:3] offset:160 -; GCN-NEXT: global_store_dwordx4 v68, v[12:15], s[2:3] offset:176 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: global_store_dwordx4 v68, v[16:19], s[2:3] offset:192 -; GCN-NEXT: global_store_dwordx4 v68, v[20:23], s[2:3] offset:208 -; GCN-NEXT: global_store_dwordx4 v68, v[24:27], s[2:3] offset:224 -; GCN-NEXT: global_store_dwordx4 v68, v[44:47], s[2:3] -; GCN-NEXT: global_store_dwordx4 v68, v[48:51], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v68, v[52:55], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v68, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v68, v[60:63], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v68, v[28:31], s[2:3] offset:240 -; GCN-NEXT: global_store_dwordx4 v68, v[32:35], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v68, v[36:39], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v68, v[40:43], s[2:3] offset:112 +; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v2, v0 +; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v3, v1, vcc +; GCN-NEXT: global_load_dwordx4 v[4:7], v[56:57], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[56:57], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[56:57], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v[58:59], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[58:59], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[58:59], off 
offset:48 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[60:61], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[60:61], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[60:61], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:64 +; GCN-NEXT: ; kill: killed $vgpr56 killed $vgpr57 +; GCN-NEXT: ; kill: killed $vgpr60 killed $vgpr61 +; GCN-NEXT: ; kill: killed $vgpr58 killed $vgpr59 +; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:128 +; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:192 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_mov_b32_e32 v17, 0x3e7 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:16 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:32 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:48 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:64 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:144 +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:160 +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:176 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:192 +; GCN-NEXT: global_store_dwordx4 v64, 
v[28:31], s[2:3] offset:208 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:224 +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:240 ; GCN-NEXT: s_endpgm ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 8, v0 ; GFX10-NEXT: s_movk_i32 s4, 0x80 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_movk_i32 s4, 0xc0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v70, s[0:1] -; GFX10-NEXT: global_load_dwordx4 v[36:39], v70, s[0:1] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[40:43], v70, s[0:1] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[44:47], v70, s[0:1] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v70, s[0:1] offset:64 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v70 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v64, vcc_lo, v0, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v66, vcc_lo, v0, v1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v68, vcc_lo, v0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo -; GFX10-NEXT: s_clause 0xa -; GFX10-NEXT: global_load_dwordx4 v[52:55], v[64:65], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[56:59], v[64:65], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[60:63], v[64:65], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[66:67], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[66:67], off offset:32 -; GFX10-NEXT: global_load_dwordx4 
v[12:15], v[66:67], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[68:69], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v[68:69], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v[68:69], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v70, s[0:1] offset:128 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v70, s[0:1] offset:192 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v66 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_add_co_u32 v60, vcc_lo, v4, 64 +; GFX10-NEXT: v_add_co_ci_u32_e32 v61, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v62, vcc_lo, v4, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v63, vcc_lo, v5, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v64, vcc_lo, v4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, v5, v1, vcc_lo +; GFX10-NEXT: s_clause 0xe +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[60:61], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[60:61], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[62:63], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[62:63], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v[62:63], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v[64:65], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[32:35], v[64:65], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[36:39], v[64:65], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v66, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[40:43], v66, s[0:1] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[44:47], v66, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v66, s[0:1] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[52:55], v66, s[0:1] 
offset:64 +; GFX10-NEXT: global_load_dwordx4 v[56:59], v66, s[0:1] offset:128 +; GFX10-NEXT: ; kill: killed $vgpr60 killed $vgpr61 +; GFX10-NEXT: ; kill: killed $vgpr64 killed $vgpr65 +; GFX10-NEXT: ; kill: killed $vgpr62 killed $vgpr63 +; GFX10-NEXT: global_load_dwordx4 v[60:63], v66, s[0:1] offset:192 +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: v_mov_b32_e32 v17, 0x3e7 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: global_store_dwordx4 v66, v[0:3], s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: global_store_dwordx4 v66, v[40:43], s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v66, v[44:47], s[2:3] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: global_store_dwordx4 v66, v[48:51], s[2:3] offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v66, v[52:55], s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v70, v[0:3], s[2:3] offset:128 -; GFX10-NEXT: global_store_dwordx4 v70, v[4:7], s[2:3] offset:144 -; GFX10-NEXT: global_store_dwordx4 v70, v[8:11], s[2:3] offset:160 -; GFX10-NEXT: global_store_dwordx4 v70, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: global_store_dwordx4 v66, v[56:59], s[2:3] offset:128 +; GFX10-NEXT: global_store_dwordx4 v66, v[4:7], s[2:3] offset:80 +; GFX10-NEXT: global_store_dwordx4 v66, v[8:11], s[2:3] offset:96 +; GFX10-NEXT: global_store_dwordx4 v66, v[12:15], s[2:3] offset:112 +; GFX10-NEXT: global_store_dwordx4 v66, v[16:19], s[2:3] offset:144 +; GFX10-NEXT: global_store_dwordx4 v66, v[20:23], s[2:3] offset:160 +; GFX10-NEXT: global_store_dwordx4 v66, v[24:27], s[2:3] offset:176 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v70, v[16:19], s[2:3] offset:192 -; GFX10-NEXT: global_store_dwordx4 v70, v[20:23], s[2:3] offset:208 -; GFX10-NEXT: global_store_dwordx4 v70, v[24:27], s[2:3] offset:224 -; GFX10-NEXT: global_store_dwordx4 v70, v[32:35], s[2:3] -; GFX10-NEXT: global_store_dwordx4 v70, 
v[36:39], s[2:3] offset:16 -; GFX10-NEXT: global_store_dwordx4 v70, v[40:43], s[2:3] offset:32 -; GFX10-NEXT: global_store_dwordx4 v70, v[44:47], s[2:3] offset:48 -; GFX10-NEXT: global_store_dwordx4 v70, v[48:51], s[2:3] offset:64 -; GFX10-NEXT: global_store_dwordx4 v70, v[52:55], s[2:3] offset:80 -; GFX10-NEXT: global_store_dwordx4 v70, v[56:59], s[2:3] offset:96 -; GFX10-NEXT: global_store_dwordx4 v70, v[60:63], s[2:3] offset:112 -; GFX10-NEXT: global_store_dwordx4 v70, v[28:31], s[2:3] offset:240 +; GFX10-NEXT: global_store_dwordx4 v66, v[60:63], s[2:3] offset:192 +; GFX10-NEXT: global_store_dwordx4 v66, v[28:31], s[2:3] offset:208 +; GFX10-NEXT: global_store_dwordx4 v66, v[32:35], s[2:3] offset:224 +; GFX10-NEXT: global_store_dwordx4 v66, v[36:39], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir @@ -1,353 +1,38 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s ---- -name: extract512 -legalized: true -regBankSelected: true - -body: | - bb.0: - ; CHECK-LABEL: name: extract512 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_512 = IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub2 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub3 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub4 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub5 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub6 - ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY 
[[DEF]].sub7 - ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub8 - ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub9 - ; CHECK: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub10 - ; CHECK: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub11 - ; CHECK: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub12 - ; CHECK: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub13 - ; CHECK: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub14 - ; CHECK: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub15 - ; CHECK: $sgpr0 = COPY [[COPY]] - ; CHECK: $sgpr1 = COPY [[COPY1]] - ; CHECK: $sgpr2 = COPY [[COPY2]] - ; CHECK: $sgpr3 = COPY [[COPY3]] - ; CHECK: $sgpr4 = COPY [[COPY4]] - ; CHECK: $sgpr5 = COPY [[COPY5]] - ; CHECK: $sgpr6 = COPY [[COPY6]] - ; CHECK: $sgpr7 = COPY [[COPY7]] - ; CHECK: $sgpr8 = COPY [[COPY8]] - ; CHECK: $sgpr9 = COPY [[COPY9]] - ; CHECK: $sgpr10 = COPY [[COPY10]] - ; CHECK: $sgpr11 = COPY [[COPY11]] - ; CHECK: $sgpr12 = COPY [[COPY12]] - ; CHECK: $sgpr13 = COPY [[COPY13]] - ; CHECK: $sgpr14 = COPY [[COPY14]] - ; CHECK: $sgpr15 = COPY [[COPY15]] - ; CHECK: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15 - %0:sgpr(s512) = G_IMPLICIT_DEF - %1:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 0 - %2:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 32 - %3:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 64 - %4:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 96 - %5:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 128 - %6:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 160 - %7:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 192 - %8:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 224 - %9:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 256 - %10:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 288 - %11:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 320 - %12:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 352 - %13:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 384 - %14:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 416 - %15:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 448 - %16:sgpr(s32) = G_EXTRACT %0:sgpr(s512), 480 - $sgpr0 = 
COPY %1:sgpr(s32) - $sgpr1 = COPY %2:sgpr(s32) - $sgpr2 = COPY %3:sgpr(s32) - $sgpr3 = COPY %4:sgpr(s32) - $sgpr4 = COPY %5:sgpr(s32) - $sgpr5 = COPY %6:sgpr(s32) - $sgpr6 = COPY %7:sgpr(s32) - $sgpr7 = COPY %8:sgpr(s32) - $sgpr8 = COPY %9:sgpr(s32) - $sgpr9 = COPY %10:sgpr(s32) - $sgpr10 = COPY %11:sgpr(s32) - $sgpr11 = COPY %12:sgpr(s32) - $sgpr12 = COPY %13:sgpr(s32) - $sgpr13 = COPY %14:sgpr(s32) - $sgpr14 = COPY %15:sgpr(s32) - $sgpr15 = COPY %16:sgpr(s32) - SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15 -... - ---- -name: extract_s_s32_s1024 -legalized: true -regBankSelected: true - -body: | - bb.0: - ; CHECK-LABEL: name: extract_s_s32_s1024 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_1024 = IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub2 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub3 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub4 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub5 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub6 - ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub7 - ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub8 - ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub9 - ; CHECK: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub10 - ; CHECK: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub11 - ; CHECK: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub12 - ; CHECK: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub13 - ; CHECK: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub14 - ; CHECK: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub15 - ; CHECK: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub16 - ; CHECK: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub17 - ; CHECK: [[COPY18:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub18 - ; CHECK: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub19 - ; CHECK: 
[[COPY20:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub20 - ; CHECK: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub21 - ; CHECK: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub22 - ; CHECK: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub23 - ; CHECK: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub24 - ; CHECK: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub25 - ; CHECK: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub26 - ; CHECK: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub27 - ; CHECK: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub28 - ; CHECK: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub29 - ; CHECK: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub30 - ; CHECK: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub31 - ; CHECK: S_ENDPGM 0, implicit [[DEF]], implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]], implicit [[COPY10]], implicit [[COPY11]], implicit [[COPY12]], implicit [[COPY13]], implicit [[COPY14]], implicit [[COPY15]], implicit [[COPY16]], implicit [[COPY17]], implicit [[COPY18]], implicit [[COPY19]], implicit [[COPY20]], implicit [[COPY21]], implicit [[COPY22]], implicit [[COPY23]], implicit [[COPY24]], implicit [[COPY25]], implicit [[COPY26]], implicit [[COPY27]], implicit [[COPY28]], implicit [[COPY29]], implicit [[COPY30]], implicit [[COPY31]] - %0:sgpr(s1024) = G_IMPLICIT_DEF - %1:sgpr(s32) = G_EXTRACT %0:sgpr, 0 - %2:sgpr(s32) = G_EXTRACT %0:sgpr, 32 - %3:sgpr(s32) = G_EXTRACT %0:sgpr, 64 - %4:sgpr(s32) = G_EXTRACT %0:sgpr, 96 - %5:sgpr(s32) = G_EXTRACT %0:sgpr, 128 - %6:sgpr(s32) = G_EXTRACT %0:sgpr, 160 - %7:sgpr(s32) = G_EXTRACT %0:sgpr, 192 - %8:sgpr(s32) = G_EXTRACT %0:sgpr, 224 - %9:sgpr(s32) = G_EXTRACT %0:sgpr, 256 - %10:sgpr(s32) = G_EXTRACT %0:sgpr, 288 - %11:sgpr(s32) = G_EXTRACT %0:sgpr, 320 - %12:sgpr(s32) = G_EXTRACT %0:sgpr, 352 - %13:sgpr(s32) = G_EXTRACT %0:sgpr, 384 - %14:sgpr(s32) = G_EXTRACT %0:sgpr, 416 - 
%15:sgpr(s32) = G_EXTRACT %0:sgpr, 448 - %16:sgpr(s32) = G_EXTRACT %0:sgpr, 480 - - %17:sgpr(s32) = G_EXTRACT %0:sgpr, 512 - %18:sgpr(s32) = G_EXTRACT %0:sgpr, 544 - %19:sgpr(s32) = G_EXTRACT %0:sgpr, 576 - %20:sgpr(s32) = G_EXTRACT %0:sgpr, 608 - %21:sgpr(s32) = G_EXTRACT %0:sgpr, 640 - %22:sgpr(s32) = G_EXTRACT %0:sgpr, 672 - %23:sgpr(s32) = G_EXTRACT %0:sgpr, 704 - %24:sgpr(s32) = G_EXTRACT %0:sgpr, 736 - %25:sgpr(s32) = G_EXTRACT %0:sgpr, 768 - %26:sgpr(s32) = G_EXTRACT %0:sgpr, 800 - %27:sgpr(s32) = G_EXTRACT %0:sgpr, 832 - %28:sgpr(s32) = G_EXTRACT %0:sgpr, 864 - %29:sgpr(s32) = G_EXTRACT %0:sgpr, 896 - %30:sgpr(s32) = G_EXTRACT %0:sgpr, 928 - %31:sgpr(s32) = G_EXTRACT %0:sgpr, 960 - %32:sgpr(s32) = G_EXTRACT %0:sgpr, 992 - - S_ENDPGM 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31, implicit %32 -... - -# TODO: Handle offset 32 ---- -name: extract_sgpr_s64_from_s128 -legalized: true -regBankSelected: true - -body: | - bb.0: - ; CHECK-LABEL: name: extract_sgpr_s64_from_s128 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY [[DEF]].sub0_sub1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[DEF]].sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[COPY]], implicit [[COPY1]] - %0:sgpr(s128) = G_IMPLICIT_DEF - %1:sgpr(s64) = G_EXTRACT %0, 0 - %2:sgpr(s64) = G_EXTRACT %0, 64 - S_ENDPGM 0, implicit %1, implicit %2 -... 
- ---- -name: extract_sgpr_s96_from_s128 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-LABEL: name: extract_sgpr_s96_from_s128 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = COPY [[COPY]] - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY1]].sub0_sub1_sub2 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub1_sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]] - %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - %1:sgpr(s96) = G_EXTRACT %0, 0 - %2:sgpr(s96) = G_EXTRACT %0, 32 - S_ENDPGM 0, implicit %1, implicit %2 - -... - ---- -name: extract_sgpr_s32_from_v3s32 -legalized: true -regBankSelected: true +--- | + @external_constant32 = external addrspace(6) constant i32, align 4 -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2 - ; CHECK-LABEL: name: extract_sgpr_s32_from_v3s32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_96 = COPY $sgpr0_sgpr1_sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub2 - ; CHECK: S_ENDPGM 0, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]] - %0:sgpr(<3 x s32>) = COPY $sgpr0_sgpr1_sgpr2 - %1:sgpr(s32) = G_EXTRACT %0, 0 - %2:sgpr(s32) = G_EXTRACT %0, 32 - %3:sgpr(s32) = G_EXTRACT %0, 64 - S_ENDPGM 0, implicit %0, implicit %2, implicit %3 + define void @external_constant32_got() { + ret void + } ... --- -name: extract_sgpr_v2s32_from_v3s32 +name: external_constant32_got legalized: true regBankSelected: true -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2 - ; CHECK-LABEL: name: extract_sgpr_v2s32_from_v3s32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_96_with_sub0_sub1 = COPY $sgpr0_sgpr1_sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[COPY]].sub0_sub1 - ; CHECK: S_ENDPGM 0, implicit [[COPY1]] - %0:sgpr(<3 x s32>) = COPY $sgpr0_sgpr1_sgpr2 - %1:sgpr(<2 x s32>) = G_EXTRACT %0, 0 - S_ENDPGM 0, implicit %1 - -... 
- ---- -name: extract_sgpr_v3s32_from_v4s32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-LABEL: name: extract_sgpr_v3s32_from_v4s32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = COPY [[COPY]] - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY1]].sub0_sub1_sub2 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub1_sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]] - %0:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - %1:sgpr(<3 x s32>) = G_EXTRACT %0, 0 - %2:sgpr(<3 x s32>) = G_EXTRACT %0, 32 - S_ENDPGM 0, implicit %1, implicit %2 - -... - ---- -name: extract_sgpr_v2s16_from_v4s16_offset0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extract_sgpr_v2s16_from_v4s16_offset0 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; CHECK: S_ENDPGM 0, implicit [[COPY1]] - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(<2 x s16>) = G_EXTRACT %0, 0 - S_ENDPGM 0, implicit %1 - -... - ---- -name: extract_sgpr_v2s16_from_v4s16_offset32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extract_sgpr_v2s16_from_v4s16_offset32 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; CHECK: S_ENDPGM 0, implicit [[COPY1]] - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(<2 x s16>) = G_EXTRACT %0, 32 - S_ENDPGM 0, implicit %1 - -... 
- -# FIXME: Probably should not be legal ---- -name: extract_sgpr_s16_from_v4s16_offset0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extract_sgpr_s16_from_v4s16_offset0 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; CHECK: S_ENDPGM 0, implicit [[COPY1]] - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(s16) = G_EXTRACT %0, 0 - S_ENDPGM 0, implicit %1 - -... - -# FIXME: Probably should not be legal ---- -name: extract_sgpr_s16_from_v4s16_offset32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extract_sgpr_s16_from_v4s16_offset32 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; CHECK: S_ENDPGM 0, implicit [[COPY1]] - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(s16) = G_EXTRACT %0, 32 - S_ENDPGM 0, implicit %1 - -... - -# FIXME: Probably should not be legal ---- -name: extract_sgpr_s16_from_v6s16_offset32 -legalized: true -regBankSelected: true +body: | + bb.1 (%ir-block.0): + liveins: $sgpr30_sgpr31 -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2 - ; CHECK-LABEL: name: extract_sgpr_s16_from_v6s16_offset32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_96 = COPY $sgpr0_sgpr1_sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; CHECK: S_ENDPGM 0, implicit [[COPY1]] - %0:sgpr(<6 x s16>) = COPY $sgpr0_sgpr1_sgpr2 - %1:sgpr(s16) = G_EXTRACT %0, 32 - S_ENDPGM 0, implicit %1 + ; CHECK-LABEL: name: external_constant32_got + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant32 + 4, target-flags(amdgpu-gotprel32-hi) @external_constant32 + 12, implicit-def $scc + ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: 
(dereferenceable invariant load (p4) from got, addrspace 4) + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; CHECK: $vgpr0 = COPY [[COPY1]] + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:sgpr_64 = COPY $sgpr30_sgpr31 + %3:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant32 + 4, target-flags(amdgpu-gotprel32-hi) @external_constant32 + 12, implicit-def $scc + %4:sgpr(p4) = G_LOAD %3(p4) :: (dereferenceable invariant load (p4) from got, addrspace 4) + %1:sgpr(p6) = G_EXTRACT %4(p4), 0 + $vgpr0 = COPY %1(p6) + %2:ccr_sgpr_64 = COPY %0 + S_SETPC_B64_return %2, implicit $vgpr0 ... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -16,37 +16,39 @@ ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 - ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY3]], 0, 0 :: (load (s64), addrspace 1) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], 
%subreg.sub1 - ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec + ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK: %14:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY4]], 1, [[COPY5]], 0, 0, implicit $mode, implicit $exec + ; CHECK: %17:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %14, 0, 0, implicit $mode, implicit $exec ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %17, [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 %8:sgpr(p4) = G_PTR_ADD %2, %7(s64) %9:sgpr(<2 x s64>) = G_LOAD %8(p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) - %10:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 0 - %13:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 64 - %15:sgpr(p1) = G_INTTOPTR %13(s64) + %29:sgpr(s128) = G_BITCAST %9(<2 x s64>) + %27:sgpr(s64) = G_TRUNC %29(s128) + %25:sgpr(s64), %26:sgpr(s64) = G_UNMERGE_VALUES %9(<2 x s64>) + %15:sgpr(p1) = G_INTTOPTR %26(s64) %18:sgpr(s64) = G_LOAD %15(p1) :: (load (s64), addrspace 1) %19:sgpr(s64) = G_FCONSTANT double -0.000000e+00 %24:sgpr(s64) = G_FNEG %18 - %25:vgpr(s64) = COPY %19(s64) - %26:vgpr(s64) = COPY %24(s64) - %20:vgpr(s64) = G_FADD %25, %26 + %30:vgpr(s64) = COPY %19(s64) + %31:vgpr(s64) = COPY %24(s64) + %20:vgpr(s64) = G_FADD %30, %31 %21:vgpr(s64) = G_FFLOOR %20 %23:vgpr(s64) = G_FNEG %21 %22:vgpr(s64) = G_FADD %20, %23 - %12:sgpr(p1) = 
G_INTTOPTR %10(s64) - %27:vgpr(p1) = COPY %12(p1) - G_STORE %22(s64), %27(p1) :: (store (s64), addrspace 1) + %12:sgpr(p1) = G_INTTOPTR %27(s64) + %32:vgpr(p1) = COPY %12(p1) + G_STORE %22(s64), %32(p1) :: (store (s64), addrspace 1) S_ENDPGM 0 ... @@ -63,40 +65,41 @@ ; CHECK-LABEL: name: fract_f64_neg_abs ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = - ; S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 - ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1) + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY3]], 0, 0 :: (load (s64), addrspace 1) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; CHECK: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec + ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK: %15:vreg_64 = 
nofpexcept V_ADD_F64_e64 0, [[COPY4]], 3, [[COPY5]], 0, 0, implicit $mode, implicit $exec + ; CHECK: %18:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %15, 0, 0, implicit $mode, implicit $exec ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %18, [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 %8:sgpr(p4) = G_PTR_ADD %2, %7(s64) %9:sgpr(<2 x s64>) = G_LOAD %8(p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) - %10:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 0 - %13:sgpr(s64) = G_EXTRACT %9(<2 x s64>), 64 - %15:sgpr(p1) = G_INTTOPTR %13(s64) + %30:sgpr(s128) = G_BITCAST %9(<2 x s64>) + %28:sgpr(s64) = G_TRUNC %30(s128) + %26:sgpr(s64), %27:sgpr(s64) = G_UNMERGE_VALUES %9(<2 x s64>) + %15:sgpr(p1) = G_INTTOPTR %27(s64) %18:sgpr(s64) = G_LOAD %15(p1) :: (load (s64), addrspace 1) %19:sgpr(s64) = G_FABS %18 %20:sgpr(s64) = G_FCONSTANT double -0.000000e+00 %25:sgpr(s64) = G_FNEG %19 - %26:vgpr(s64) = COPY %20(s64) - %27:vgpr(s64) = COPY %25(s64) - %21:vgpr(s64) = G_FADD %26, %27 + %31:vgpr(s64) = COPY %20(s64) + %32:vgpr(s64) = COPY %25(s64) + %21:vgpr(s64) = G_FADD %31, %32 %22:vgpr(s64) = G_FFLOOR %21 %24:vgpr(s64) = G_FNEG %22 %23:vgpr(s64) = G_FADD %21, %24 - %12:sgpr(p1) = G_INTTOPTR %10(s64) - %28:vgpr(p1) = COPY %12(p1) - G_STORE %23(s64), %28(p1) :: (store (s64), addrspace 1) + %12:sgpr(p1) = G_INTTOPTR %28(s64) + %33:vgpr(p1) = COPY %12(p1) + G_STORE %23(s64), %33(p1) :: (store (s64), addrspace 1) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir +++ /dev/null @@ -1,607 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s - ---- - -name: insert_s512_s32 -legalized: true -regBankSelected: true - -body: | - bb.0: - ; CHECK-LABEL: name: insert_s512_s32 - ; CHECK: [[DEF:%[0-9]+]]:sgpr_512 = IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[DEF]], [[DEF1]], %subreg.sub0 - ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG]], [[DEF1]], %subreg.sub1 - ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG1]], [[DEF1]], %subreg.sub2 - ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG2]], [[DEF1]], %subreg.sub3 - ; CHECK: [[INSERT_SUBREG4:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG3]], [[DEF1]], %subreg.sub4 - ; CHECK: [[INSERT_SUBREG5:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG4]], [[DEF1]], %subreg.sub5 - ; CHECK: [[INSERT_SUBREG6:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG5]], [[DEF1]], %subreg.sub6 - ; CHECK: [[INSERT_SUBREG7:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG6]], [[DEF1]], %subreg.sub7 - ; CHECK: [[INSERT_SUBREG8:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG7]], [[DEF1]], %subreg.sub8 - ; CHECK: [[INSERT_SUBREG9:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG8]], [[DEF1]], %subreg.sub9 - ; CHECK: [[INSERT_SUBREG10:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG9]], [[DEF1]], %subreg.sub10 - ; CHECK: [[INSERT_SUBREG11:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG10]], [[DEF1]], %subreg.sub11 - ; CHECK: [[INSERT_SUBREG12:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG11]], [[DEF1]], 
%subreg.sub12 - ; CHECK: [[INSERT_SUBREG13:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG12]], [[DEF1]], %subreg.sub13 - ; CHECK: [[INSERT_SUBREG14:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG13]], [[DEF1]], %subreg.sub14 - ; CHECK: [[INSERT_SUBREG15:%[0-9]+]]:sgpr_512 = INSERT_SUBREG [[INSERT_SUBREG14]], [[DEF1]], %subreg.sub15 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[INSERT_SUBREG15]] - ; CHECK: SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - %0:sgpr(s512) = G_IMPLICIT_DEF - %1:sgpr(s32) = G_IMPLICIT_DEF - %2:sgpr(s512) = G_INSERT %0:sgpr, %1:sgpr(s32), 0 - %3:sgpr(s512) = G_INSERT %2:sgpr, %1:sgpr(s32), 32 - %4:sgpr(s512) = G_INSERT %3:sgpr, %1:sgpr(s32), 64 - %5:sgpr(s512) = G_INSERT %4:sgpr, %1:sgpr(s32), 96 - %6:sgpr(s512) = G_INSERT %5:sgpr, %1:sgpr(s32), 128 - %7:sgpr(s512) = G_INSERT %6:sgpr, %1:sgpr(s32), 160 - %8:sgpr(s512) = G_INSERT %7:sgpr, %1:sgpr(s32), 192 - %9:sgpr(s512) = G_INSERT %8:sgpr, %1:sgpr(s32), 224 - %10:sgpr(s512) = G_INSERT %9:sgpr, %1:sgpr(s32), 256 - %11:sgpr(s512) = G_INSERT %10:sgpr, %1:sgpr(s32), 288 - %12:sgpr(s512) = G_INSERT %11:sgpr, %1:sgpr(s32), 320 - %13:sgpr(s512) = G_INSERT %12:sgpr, %1:sgpr(s32), 352 - %14:sgpr(s512) = G_INSERT %13:sgpr, %1:sgpr(s32), 384 - %15:sgpr(s512) = G_INSERT %14:sgpr, %1:sgpr(s32), 416 - %16:sgpr(s512) = G_INSERT %15:sgpr, %1:sgpr(s32), 448 - %17:sgpr(s512) = G_INSERT %16:sgpr, %1:sgpr(s32), 480 - $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %17:sgpr(s512) - SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 -... 
- ---- - -name: insert_v_s64_v_s32_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1, $vgpr2 - ; CHECK-LABEL: name: insert_v_s64_v_s32_0 - ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = COPY $vgpr2 - %2:vgpr(s64) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_v_s64_v_s32_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1, $vgpr2 - ; CHECK-LABEL: name: insert_v_s64_v_s32_32 - ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = COPY $vgpr2 - %2:vgpr(s64) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_s64_s_s32_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $sgpr2 - ; CHECK-LABEL: name: insert_s_s64_s_s32_0 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s64) = COPY $sgpr0_sgpr1 - %1:sgpr(s32) = COPY $sgpr2 - %2:sgpr(s64) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_s_s64_s_s32_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $sgpr2 - ; CHECK-LABEL: name: insert_s_s64_s_s32_32 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s64) = COPY $sgpr0_sgpr1 - %1:sgpr(s32) = COPY $sgpr2 - %2:sgpr(s64) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_s64_v_s32_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $vgpr0 - ; CHECK-LABEL: name: insert_s_s64_v_s32_32 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s64) = COPY $sgpr0_sgpr1 - %1:vgpr(s32) = COPY $vgpr2 - %2:vgpr(s64) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_v_s64_s_s32_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1, $sgpr0 - ; CHECK-LABEL: name: insert_v_s64_s_s32_32 - ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s64) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_v_s96_v_s64_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4 - ; CHECK-LABEL: name: insert_v_s96_v_s64_0 - ; CHECK: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s96) = COPY $vgpr0_vgpr1_vgpr2 - %1:vgpr(s64) = COPY $vgpr3_vgpr4 - %2:vgpr(s96) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_v_s96_v_s64_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4 - ; CHECK-LABEL: name: insert_v_s96_v_s64_32 - ; CHECK: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s96) = COPY $vgpr0_vgpr1_vgpr2 - %1:vgpr(s64) = COPY $vgpr3_vgpr4 - %2:vgpr(s96) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_s96_s_s64_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2, $sgpr4_sgpr5 - ; CHECK-LABEL: name: insert_s_s96_s_s64_0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_96_with_sub0_sub1 = COPY $sgpr0_sgpr1_sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s96) = COPY $sgpr0_sgpr1_sgpr2 - %1:sgpr(s64) = COPY $sgpr4_sgpr5 - %2:sgpr(s96) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_s_s96_s_s64_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2, $sgpr4_sgpr5 - ; CHECK-LABEL: name: insert_s_s96_s_s64_32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_96_with_sub1_sub2 = COPY $sgpr0_sgpr1_sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s96) = COPY $sgpr0_sgpr1_sgpr2 - %1:sgpr(s64) = COPY $sgpr4_sgpr5 - %2:sgpr(s96) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_s128_s_s64_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5 - ; CHECK-LABEL: name: insert_s_s128_s_s64_0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - %1:sgpr(s64) = COPY $sgpr4_sgpr5 - %2:sgpr(s128) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... - -# --- - -# name: insert_s_s128_s_s64_32 -# legalized: true -# regBankSelected: true - -# body: | -# bb.0: -# liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5 -# %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 -# %1:sgpr(s64) = COPY $sgpr4_sgpr5 -# %2:sgpr(s128) = G_INSERT %0, %1, 32 -# S_ENDPGM 0, implicit %2 -# ... 
- ---- - -name: insert_s_s128_s_s64_64 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5 - ; CHECK-LABEL: name: insert_s_s128_s_s64_64 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - %1:sgpr(s64) = COPY $sgpr4_sgpr5 - %2:sgpr(s128) = G_INSERT %0, %1, 64 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_v256_v_s64_96 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9 - ; CHECK-LABEL: name: insert_s_v256_v_s64_96 - ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr8_vgpr9 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub3_sub4 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - %1:vgpr(s64) = COPY $vgpr8_vgpr9 - %2:vgpr(s256) = G_INSERT %0, %1, 96 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_s_s256_s_s64_128 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9 - ; CHECK-LABEL: name: insert_s_s256_s_s64_128 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub4_sub5 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s256) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - %1:sgpr(s64) = COPY $sgpr4_sgpr5 - %2:sgpr(s256) = G_INSERT %0, %1, 128 - S_ENDPGM 0, implicit %2 -... - -# --- - -# name: insert_s_s256_s_s64_160 -# legalized: true -# regBankSelected: true - -# body: | -# bb.0: -# liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9 -# %0:sgpr(s256) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 -# %1:sgpr(s64) = COPY $sgpr4_sgpr5 -# %2:sgpr(s256) = G_INSERT %0, %1, 160 -# S_ENDPGM 0, implicit %2 -# ... - ---- - -name: insert_s_s128_s_s96_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr6_sgpr7_sgpr8 - ; CHECK-LABEL: name: insert_s_s128_s_s96_0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1_sub2 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 - %2:sgpr(s128) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_s_s128_s_s96_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr6_sgpr7_sgpr8 - ; CHECK-LABEL: name: insert_s_s128_s_s96_32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 - %2:sgpr(s128) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_s160_s_s96_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr6_sgpr7_sgpr8 - ; CHECK-LABEL: name: insert_s_s160_s_s96_0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_160_with_sub0_sub1_sub2 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_160 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1_sub2 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s160) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 - %2:sgpr(s160) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_s_s160_s_s96_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr6_sgpr7_sgpr8 - ; CHECK-LABEL: name: insert_s_s160_s_s96_32 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_160_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_160 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s160) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 - %2:sgpr(s160) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_s_s160_s_s96_64 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr6_sgpr7_sgpr8 - ; CHECK-LABEL: name: insert_s_s160_s_s96_64 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_160_with_sub2_sub3_sub4 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_160 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub2_sub3_sub4 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s160) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 - %2:sgpr(s160) = G_INSERT %0, %1, 64 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_s_s256_s_s128_0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 - - ; CHECK-LABEL: name: insert_s_s256_s_s128_0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sgpr_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1_sub2_sub3 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(s256) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - %1:sgpr(s128) = COPY $sgpr8_sgpr9_sgpr10_sgpr11 - %2:sgpr(s256) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_v_s256_v_s128_32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 - - ; CHECK-LABEL: name: insert_v_s256_v_s128_32 - ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2_sub3_sub4 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - %2:vgpr(s256) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_v_s256_v_s128_64 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 - - ; CHECK-LABEL: name: insert_v_s256_v_s128_64 - ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub2_sub3_sub4_sub5 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - %2:vgpr(s256) = G_INSERT %0, %1, 64 - S_ENDPGM 0, implicit %2 -... - ---- - -name: insert_v_s256_v_s128_96 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 - - ; CHECK-LABEL: name: insert_v_s256_v_s128_96 - ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub3_sub4_sub5_sub6 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - %2:vgpr(s256) = G_INSERT %0, %1, 96 - S_ENDPGM 0, implicit %2 -... 
- ---- - -name: insert_v_s256_v_s128_128 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 - - ; CHECK-LABEL: name: insert_v_s256_v_s128_128 - ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub4_sub5_sub6_sub7 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 - %2:vgpr(s256) = G_INSERT %0, %1, 128 - S_ENDPGM 0, implicit %2 -... - ---- -name: insert_sgpr_v2s16_to_v4s16_offset0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $sgpr2 - ; CHECK-LABEL: name: insert_sgpr_v2s16_to_v4s16_offset0 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(<2 x s16>) = COPY $sgpr2 - %2:sgpr(<4 x s16>) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 - -... - ---- -name: insert_sgpr_v2s16_to_v4s16_offset32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $sgpr2 - ; CHECK-LABEL: name: insert_sgpr_v2s16_to_v4s16_offset32 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 - ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(<2 x s16>) = COPY $sgpr2 - %2:sgpr(<4 x s16>) = G_INSERT %0, %1, 32 - S_ENDPGM 0, implicit %2 -... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir +++ /dev/null @@ -1,39 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s - -# ERR: remark: :0:0: cannot select: %3:sgpr(<4 x s16>) = G_INSERT %0:sgpr, %2:sgpr(s16), 0 (in function: insert_sgpr_s16_to_v4s16_offset0) -# ERR-NEXT: remark: :0:0: cannot select: %2:sgpr(<16 x s32>) = G_INSERT %0:sgpr, %1:sgpr(<8 x s32>), 0 (in function: insert_sgpr_v8s32_to_v16s32_offset0) -# ERR-NOT: remark - -# FIXME: This 16-bit insert source should not be legal and this test -# should be deleted ---- -name: insert_sgpr_s16_to_v4s16_offset0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $sgpr2 - %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 - %1:sgpr(s32) = COPY $sgpr2 - %2:sgpr(s16) = G_TRUNC %1 - %3:sgpr(<4 x s16>) = G_INSERT %0, %2, 0 - S_ENDPGM 0, implicit %3 - -... - -# getSubRegFromChannel current does not handle cases > 128-bits ---- -name: insert_sgpr_v8s32_to_v16s32_offset0 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 - %0:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - %1:sgpr(<8 x s32>) = COPY $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 - %2:sgpr(<16 x s32>) = G_INSERT %0, %1, 0 - S_ENDPGM 0, implicit %2 - -... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -2042,11 +2042,10 @@ ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>) - ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<3 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS [[CONCAT_VECTORS]](<4 x s16>) ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN: G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store (<3 x s16>) into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: G_STORE [[DELETE_TRAILING_VECTOR_ELTS]](<3 x s16>), [[DEF]](p1) :: (volatile store (<3 x s16>) into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) ; GCN: S_ENDPGM 0 %val = call <3 x i16> @external_v3i16_func_void() store volatile <3 x i16> %val, <3 x i16> addrspace(1)* undef @@ -2213,11 +2212,10 @@ ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit 
$sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>) - ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<3 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS [[CONCAT_VECTORS]](<4 x s16>) ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN: G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store (<3 x s16>) into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: G_STORE [[DELETE_TRAILING_VECTOR_ELTS]](<3 x s16>), [[DEF]](p1) :: (volatile store (<3 x s16>) into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) ; GCN: S_ENDPGM 0 %val = call <3 x half> @external_v3f16_func_void() store volatile <3 x half> %val, <3 x half> addrspace(1)* undef Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -2064,9 +2064,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[LOAD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x 
s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<4 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<3 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<4 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg @@ -2123,9 +2122,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[LOAD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<4 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<3 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<4 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg @@ -2298,9 +2296,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(<5 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[LOAD]](<5 x s16>), [[DEF1]](<5 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<6 x s16>) = 
G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<5 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<6 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: $vgpr2 = COPY [[UV2]](<2 x s16>) @@ -2358,9 +2355,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(<7 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[LOAD]](<7 x s16>), [[DEF1]](<7 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<14 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<8 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<7 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<8 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: $vgpr2 = COPY [[UV2]](<2 x s16>) @@ -2419,9 +2415,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(<63 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<126 x s16>) = G_CONCAT_VECTORS [[LOAD]](<63 x s16>), [[DEF1]](<63 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), 
[[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>), [[UV32:%[0-9]+]]:_(<2 x s16>), [[UV33:%[0-9]+]]:_(<2 x s16>), [[UV34:%[0-9]+]]:_(<2 x s16>), [[UV35:%[0-9]+]]:_(<2 x s16>), [[UV36:%[0-9]+]]:_(<2 x s16>), [[UV37:%[0-9]+]]:_(<2 x s16>), [[UV38:%[0-9]+]]:_(<2 x s16>), [[UV39:%[0-9]+]]:_(<2 x s16>), [[UV40:%[0-9]+]]:_(<2 x s16>), [[UV41:%[0-9]+]]:_(<2 x s16>), [[UV42:%[0-9]+]]:_(<2 x s16>), [[UV43:%[0-9]+]]:_(<2 x s16>), [[UV44:%[0-9]+]]:_(<2 x s16>), [[UV45:%[0-9]+]]:_(<2 x s16>), [[UV46:%[0-9]+]]:_(<2 x s16>), [[UV47:%[0-9]+]]:_(<2 x s16>), [[UV48:%[0-9]+]]:_(<2 x s16>), [[UV49:%[0-9]+]]:_(<2 x s16>), [[UV50:%[0-9]+]]:_(<2 x s16>), [[UV51:%[0-9]+]]:_(<2 x s16>), [[UV52:%[0-9]+]]:_(<2 x s16>), [[UV53:%[0-9]+]]:_(<2 x s16>), [[UV54:%[0-9]+]]:_(<2 x s16>), [[UV55:%[0-9]+]]:_(<2 x s16>), [[UV56:%[0-9]+]]:_(<2 x s16>), [[UV57:%[0-9]+]]:_(<2 x s16>), [[UV58:%[0-9]+]]:_(<2 x s16>), [[UV59:%[0-9]+]]:_(<2 x s16>), [[UV60:%[0-9]+]]:_(<2 x s16>), [[UV61:%[0-9]+]]:_(<2 x s16>), [[UV62:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<126 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<64 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<63 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), 
[[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<64 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: $vgpr2 = COPY [[UV2]](<2 x s16>) @@ -2511,9 +2506,8 @@ ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[DEF1:%[0-9]+]]:_(<65 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[LOAD]](<65 x s16>), [[DEF1]](<65 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x 
s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>), [[UV32:%[0-9]+]]:_(<2 x s16>), [[UV33:%[0-9]+]]:_(<2 x s16>), [[UV34:%[0-9]+]]:_(<2 x s16>), [[UV35:%[0-9]+]]:_(<2 x s16>), [[UV36:%[0-9]+]]:_(<2 x s16>), [[UV37:%[0-9]+]]:_(<2 x s16>), [[UV38:%[0-9]+]]:_(<2 x s16>), [[UV39:%[0-9]+]]:_(<2 x s16>), [[UV40:%[0-9]+]]:_(<2 x s16>), [[UV41:%[0-9]+]]:_(<2 x s16>), [[UV42:%[0-9]+]]:_(<2 x s16>), [[UV43:%[0-9]+]]:_(<2 x s16>), [[UV44:%[0-9]+]]:_(<2 x s16>), [[UV45:%[0-9]+]]:_(<2 x s16>), [[UV46:%[0-9]+]]:_(<2 x s16>), [[UV47:%[0-9]+]]:_(<2 x s16>), [[UV48:%[0-9]+]]:_(<2 x s16>), [[UV49:%[0-9]+]]:_(<2 x s16>), [[UV50:%[0-9]+]]:_(<2 x s16>), [[UV51:%[0-9]+]]:_(<2 x s16>), [[UV52:%[0-9]+]]:_(<2 x s16>), [[UV53:%[0-9]+]]:_(<2 x s16>), [[UV54:%[0-9]+]]:_(<2 x s16>), [[UV55:%[0-9]+]]:_(<2 x s16>), [[UV56:%[0-9]+]]:_(<2 x s16>), [[UV57:%[0-9]+]]:_(<2 x s16>), [[UV58:%[0-9]+]]:_(<2 x s16>), [[UV59:%[0-9]+]]:_(<2 x s16>), [[UV60:%[0-9]+]]:_(<2 x s16>), [[UV61:%[0-9]+]]:_(<2 x s16>), [[UV62:%[0-9]+]]:_(<2 x s16>), [[UV63:%[0-9]+]]:_(<2 x s16>), [[UV64:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<130 x s16>) + ; CHECK: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<66 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[LOAD]](<65 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x 
s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>), [[UV32:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<66 x s16>) ; CHECK: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK: $vgpr1 = COPY [[UV1]](<2 x s16>) ; CHECK: $vgpr2 = COPY [[UV2]](<2 x s16>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -108,6 +108,7 @@ ; CHECK: bb.2.bb1: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: G_STORE [[C1]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_BR %bb.3 ; CHECK: bb.3.bb2: ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s64) ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] @@ -1167,12 +1168,11 @@ ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; CHECK: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; CHECK: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<3 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS [[CONCAT_VECTORS]](<4 x s16>) ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK: G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store (<3 x s16>) into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[DELETE_TRAILING_VECTOR_ELTS]](<3 x s16>), [[DEF]](p1) :: (store (<3 x s16>) into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]] store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef @@ -1202,12 +1202,11 @@ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>) + ; CHECK: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<5 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS [[CONCAT_VECTORS]](<6 x s16>) ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK: G_STORE [[UV]](<5 x s16>), [[DEF1]](p1) :: (store (<5 x s16>) into `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[DELETE_TRAILING_VECTOR_ELTS]](<5 x s16>), [[DEF]](p1) :: (store (<5 
x s16>) into `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1) ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; CHECK: S_SETPC_B64_return [[COPY4]] store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef @@ -1294,12 +1293,11 @@ ; CHECK: [[COPY31:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr31 ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.0, align 16, addrspace 5) - ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[COPY31]](<2 x s16>), [[LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x 
s16>), [[DEF]](<2 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<65 x s16>), [[UV1:%[0-9]+]]:_(<65 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<130 x s16>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<66 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[COPY31]](<2 x s16>), [[LOAD]](<2 x s16>) + ; CHECK: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<65 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS [[CONCAT_VECTORS]](<66 x s16>) ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK: G_STORE [[UV]](<65 x s16>), [[DEF1]](p1) :: (store (<65 x s16>) into `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[DELETE_TRAILING_VECTOR_ELTS]](<65 x s16>), [[DEF]](p1) :: (store (<65 x s16>) into `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1) ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] ; CHECK: S_SETPC_B64_return [[COPY33]] store <65 x i16> %arg0, <65 x i16> addrspace(1)* undef @@ -1598,12 +1596,11 @@ ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = 
G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; CHECK: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<3 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS [[CONCAT_VECTORS]](<4 x s16>) ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK: G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store (<3 x s16>) into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: G_STORE [[DELETE_TRAILING_VECTOR_ELTS]](<3 x s16>), [[DEF]](p1) :: (store (<3 x s16>) into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK: S_SETPC_B64_return [[COPY3]] store <3 x half> %arg0, <3 x half> addrspace(1)* undef Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -1698,9 +1698,8 @@ ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[DEF]](<2 x s16>) - ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>) + ; GCN: [[DELETE_TRAILING_VECTOR_ELTS:%[0-9]+]]:_(<3 x s16>) = G_DELETE_TRAILING_VECTOR_ELTS 
[[CONCAT_VECTORS]](<4 x s16>) ; GCN: [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16 ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] @@ -1711,11 +1710,10 @@ ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; GCN: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GCN: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GCN: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GCN: $vgpr0 = COPY [[UV2]](<2 x s16>) - ; GCN: $vgpr1 = COPY [[UV3]](<2 x s16>) + ; GCN: [[PAD_VECTOR_WITH_UNDEF_ELTS:%[0-9]+]]:_(<4 x s16>) = G_PAD_VECTOR_WITH_UNDEF_ELTS [[DELETE_TRAILING_VECTOR_ELTS]](<3 x s16>) + ; GCN: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[PAD_VECTOR_WITH_UNDEF_ELTS]](<4 x s16>) + ; GCN: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GCN: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GCN: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll @@ -114,31 +114,23 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s16>) from custom 
"ImageResource", align 8) ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) - ; UNPACKED: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C]] + ; UNPACKED: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_v3f16 ; PACKED: bb.1 (%ir-block.0): @@ -157,34 +149,8 @@ ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<3 x s16>) from custom "ImageResource", align 8) ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>) - ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 
- ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; PACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST4]](<2 x s16>) + ; PACKED: $vgpr0 = COPY [[UV]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %tex @@ -382,32 +348,24 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<3 x s16>) from custom "ImageResource", align 8) ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) - ; UNPACKED: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; UNPACKED: G_STORE [[UV3]](s32), 
[[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF4:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF3]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C]] + ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = 
G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_tfe_v3f16 ; PACKED: bb.1 (%ir-block.0): @@ -429,35 +387,11 @@ ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV1]](s32) - ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>) ; PACKED: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) - ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; PACKED: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST1]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; PACKED: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; PACKED: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; PACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; PACKED: 
[[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST5]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST6]](<2 x s16>) + ; PACKED: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) + ; PACKED: $vgpr0 = COPY [[UV3]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[UV4]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x half>, i32 } %res, 0 @@ -677,29 +611,21 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>) from custom "ImageResource") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), 
[[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_v3f16_dmask_1100 ; PACKED: 
bb.1 (%ir-block.0): @@ -717,17 +643,9 @@ ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<2 x s16>) from custom "ImageResource") - ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; PACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>) ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] @@ -735,16 +653,13 @@ ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - 
; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST4]](<2 x s16>) + ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %tex @@ -767,27 +682,18 @@ ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16) from custom "ImageResource") - ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) 
= G_AND [[COPY10]], [[C1]] - ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_v3f16_dmask_1000 ; PACKED: bb.1 (%ir-block.0): @@ -805,17 +711,9 @@ ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s16) from custom "ImageResource") - ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = 
G_IMPLICIT_DEF ; PACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>) ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] @@ -823,16 +721,13 @@ ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST4]](<2 x s16>) + ; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + ; PACKED: $vgpr1 = 
COPY [[BITCAST2]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %tex @@ -853,34 +748,9 @@ ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNPACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; UNPACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; UNPACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; UNPACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND 
[[COPY13]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; UNPACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST4]](<2 x s16>) + ; UNPACKED: $vgpr0 = COPY [[UV]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_v3f16_dmask_0000 ; PACKED: bb.1 (%ir-block.0): @@ -896,34 +766,9 @@ ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; PACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; PACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: 
[[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST4]](<2 x s16>) + ; PACKED: $vgpr0 = COPY [[UV]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %tex @@ -1331,30 +1176,22 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (<2 x s16>) from custom "ImageResource") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>) - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; UNPACKED: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = 
G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_tfe_v3f16_dmask_1100 ; PACKED: bb.1 (%ir-block.0): @@ -1375,18 +1212,10 @@ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x 
s32>), 1, 0, 0 :: (dereferenceable load (<2 x s16>) from custom "ImageResource") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) - ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[DEF1]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; PACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; PACKED: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] @@ -1394,16 +1223,13 @@ ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], 
[[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST4]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST5]](<2 x s16>) + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; PACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[BITCAST3]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x half>, i32 } %res, 0 @@ -1431,28 +1257,19 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16) from custom "ImageResource") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], 
[[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_tfe_v3f16_dmask_1000 ; PACKED: bb.1 (%ir-block.0): @@ -1473,18 +1290,10 @@ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16) from custom "ImageResource") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) - ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[DEF1]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; PACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; PACKED: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] @@ -1492,16 +1301,13 @@ ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: 
$vgpr0 = COPY [[BITCAST4]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST5]](<2 x s16>) + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; PACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[BITCAST3]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x half>, i32 } %res, 0 @@ -1529,28 +1335,19 @@ ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16) from custom "ImageResource") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) - ; UNPACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; UNPACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: 
[[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; UNPACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; UNPACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; UNPACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST2]](<2 x s16>) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_tfe_v3f16_dmask_0000 ; PACKED: bb.1 (%ir-block.0): @@ -1571,18 +1368,10 @@ ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load (s16) from custom "ImageResource") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) - ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x 
s16>) = G_IMPLICIT_DEF - ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; PACKED: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; PACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) ; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; PACKED: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[DEF1]](<2 x s16>) - ; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; PACKED: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; PACKED: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; PACKED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) ; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] @@ -1590,16 +1379,13 @@ ; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] ; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; PACKED: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; PACKED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; PACKED: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; PACKED: $vgpr0 = COPY [[BITCAST4]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[BITCAST5]](<2 x s16>) + ; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST 
[[OR]](s32) + ; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] + ; PACKED: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; PACKED: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[BITCAST3]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %res = call { <3 x half>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f16i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %tex = extractvalue { <3 x half>, i32 } %res, 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -188,17 +188,15 @@ ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; UNPACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) - ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) - ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) + ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>) + ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) ; UNPACKED: S_ENDPGM 0 @@ -218,33 +216,31 @@ ; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX81: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX81: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) - ; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) - ; GFX81: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX81: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) - ; GFX81: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) ; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX81: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) + ; GFX81: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX81: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX81: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>) + ; GFX81: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX81: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX81: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX81: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX81: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] ; GFX81: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX81: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] ; GFX81: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; GFX81: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX81: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX81: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX81: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) ; GFX81: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]] ; GFX81: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX81: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; GFX81: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] - ; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] - ; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; GFX81: [[BITCAST4:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST4]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) + ; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>) + ; GFX81: 
[[BITCAST5:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) ; GFX81: S_ENDPGM 0 ; GFX9-LABEL: name: image_store_v3f16 ; GFX9: bb.1 (%ir-block.0): @@ -262,23 +258,9 @@ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) - ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32) - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) ; GFX9: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) ; GFX9: S_ENDPGM 0 ; GFX10-LABEL: name: image_store_v3f16 ; GFX10: bb.1 (%ir-block.0): @@ -296,23 +278,9 @@ ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) - ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) - ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) - ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) - ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; GFX10: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32) - ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), 
[[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8) ; GFX10: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -509,13 +509,10 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16 ; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff -; GFX8-UNPACKED-NEXT: s_and_b32 s1, s0, s0 -; GFX8-UNPACKED-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v4, s0, v1 -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, s0, 
v2 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; @@ -530,15 +527,7 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 -; GFX8-PACKED-NEXT: s_mov_b32 s0, 0xffff -; GFX8-PACKED-NEXT: s_and_b32 s0, s0, s0 -; GFX8-PACKED-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-PACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-PACKED-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-PACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-PACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v3f16_xyz: @@ -552,13 +541,7 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v3f16_xyz: @@ -571,15 +554,8 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -173,21 +173,18 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s96), align 4) - ; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 - ; GFX6: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]] - ; GFX6: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2 - ; GFX6: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 - ; GFX6: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 - ; GFX6: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2 - ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0 + ; GFX6: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub1 + ; GFX6: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 + ; GFX6: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub3 + ; GFX6: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX6: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX7-LABEL: name: s_buffer_load_v3i32 @@ -200,21 +197,18 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s96), align 4) - ; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 - ; GFX7: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]] - ; GFX7: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2 - ; GFX7: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 - ; GFX7: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 - ; GFX7: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2 - ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX7: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0 + ; GFX7: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub1 + ; GFX7: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 + ; GFX7: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub3 + ; GFX7: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX7: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX8-LABEL: name: s_buffer_load_v3i32 @@ -227,21 +221,18 @@ ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0 :: (dereferenceable invariant load (s96), align 4) - ; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], 
%subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 - ; GFX8: [[COPY5:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = COPY [[REG_SEQUENCE1]] - ; GFX8: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[COPY5]].sub0_sub1_sub2 - ; GFX8: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub0 - ; GFX8: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub1 - ; GFX8: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY6]].sub2 - ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0 + ; GFX8: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub1 + ; GFX8: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub2 + ; GFX8: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub3 + ; GFX8: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec ; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -1589,17 
+1580,13 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX6: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] - ; GFX6: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] - ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 - ; GFX6: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX6: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 - ; GFX6: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 - ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 - ; GFX6: $vgpr0 = COPY [[COPY8]] - ; GFX6: $vgpr1 = COPY [[COPY9]] - ; GFX6: $vgpr2 = COPY [[COPY10]] + ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 + ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 + ; GFX6: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 + ; GFX6: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub3 + ; GFX6: $vgpr0 = COPY [[COPY5]] + ; GFX6: $vgpr1 = COPY [[COPY6]] + ; GFX6: $vgpr2 = COPY [[COPY7]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; GFX7-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX7: bb.1 (%ir-block.0): @@ -1612,17 +1599,13 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX7: 
[[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] - ; GFX7: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] - ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 - ; GFX7: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX7: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 - ; GFX7: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 - ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 - ; GFX7: $vgpr0 = COPY [[COPY8]] - ; GFX7: $vgpr1 = COPY [[COPY9]] - ; GFX7: $vgpr2 = COPY [[COPY10]] + ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 + ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 + ; GFX7: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 + ; GFX7: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub3 + ; GFX7: $vgpr0 = COPY [[COPY5]] + ; GFX7: $vgpr1 = COPY [[COPY6]] + ; GFX7: $vgpr2 = COPY [[COPY7]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; GFX8-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX8: bb.1 (%ir-block.0): @@ -1635,17 +1618,13 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GFX8: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] - ; GFX8: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] - ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 - ; GFX8: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 - ; GFX8: 
[[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 - ; GFX8: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 - ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 - ; GFX8: $vgpr0 = COPY [[COPY8]] - ; GFX8: $vgpr1 = COPY [[COPY9]] - ; GFX8: $vgpr2 = COPY [[COPY10]] + ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 + ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 + ; GFX8: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 + ; GFX8: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub3 + ; GFX8: $vgpr0 = COPY [[COPY5]] + ; GFX8: $vgpr1 = COPY [[COPY6]] + ; GFX8: $vgpr2 = COPY [[COPY7]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x float> %val Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -608,20 +608,20 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, s0 -; GFX7-NEXT: s_mov_b32 s3, s1 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x2 
; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 4 ret <3 x i32> %load @@ -630,20 +630,20 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_i96_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, s0 -; GFX7-NEXT: s_mov_b32 s3, s1 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog %load = load i96, i96 addrspace(4)* %ptr, align 8 ret i96 %load @@ -652,20 +652,20 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, s0 -; GFX7-NEXT: s_mov_b32 s3, s1 -; GFX7-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 8 ret <3 x i32> %load @@ -674,20 +674,20 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)* inreg %ptr) { ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_v6i16_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, s0 -; GFX7-NEXT: s_mov_b32 s3, s1 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog %load = load <6 x i16>, <6 x i16> addrspace(4)* %ptr, align 8 %cast = bitcast <6 x i16> %load to <3 x i32> Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -88,12 +88,7 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY 
$sgpr6 ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) - ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>) - ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[BITCAST]](s384) - ; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96) - ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>) + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; CHECK: $sgpr0 = COPY [[INT]](s32) @@ -114,12 +109,7 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) - ; GREEDY: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) - ; GREEDY: [[BITCAST:%[0-9]+]]:sgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>) - ; GREEDY: [[TRUNC:%[0-9]+]]:sgpr(s96) = G_TRUNC [[BITCAST]](s384) - ; GREEDY: [[BITCAST1:%[0-9]+]]:sgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96) - ; GREEDY: [[UV:%[0-9]+]]:sgpr(s32), 
[[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>) + ; GREEDY: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; GREEDY: $sgpr0 = COPY [[INT]](s32) @@ -420,14 +410,7 @@ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>) - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>) - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[COPY5]](<4 x s32>), [[COPY6]](<4 x s32>) - ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>) - ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[BITCAST]](s384) - ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96) - ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) ; CHECK: $vgpr1 = COPY [[UV1]](s32) ; CHECK: $vgpr2 = COPY [[UV2]](s32) @@ -444,14 +427,7 @@ ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD 
[[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>) - ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>) - ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[COPY5]](<4 x s32>), [[COPY6]](<4 x s32>) - ; GREEDY: [[BITCAST:%[0-9]+]]:vgpr(s384) = G_BITCAST [[CONCAT_VECTORS]](<12 x s32>) - ; GREEDY: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[BITCAST]](s384) - ; GREEDY: [[BITCAST1:%[0-9]+]]:vgpr(<3 x s32>) = G_BITCAST [[TRUNC]](s96) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[BITCAST1]](<3 x s32>) + ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) ; GREEDY: $vgpr1 = COPY [[UV1]](s32) ; GREEDY: $vgpr2 = COPY [[UV2]](s32) @@ -796,11 +772,87 @@ ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>) - ; CHECK: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) - ; CHECK: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1) - ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 
16 - ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef` + 16, basealign 32, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[COPY5]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST1]], [[COPY6]](s32) + ; CHECK: [[BITCAST2:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR2:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST2]], [[COPY7]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST3]], [[COPY8]](s32) + ; CHECK: [[BITCAST4:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR4:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST4]], [[COPY9]](s32) + ; CHECK: [[BITCAST5:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR5:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST5]], [[COPY10]](s32) + ; CHECK: [[BITCAST6:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: 
[[LSHR6:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST6]], [[COPY11]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR7:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST7]], [[COPY12]](s32) + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST]], [[COPY13]] + ; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR]], [[COPY14]](s32) + ; CHECK: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[BITCAST8:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST1]], [[COPY15]] + ; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR1]], [[COPY16]](s32) + ; CHECK: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[AND1]], [[SHL1]] + ; CHECK: [[BITCAST9:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND2:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST2]], [[COPY17]] + ; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL2:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR2]], [[COPY18]](s32) + ; CHECK: [[OR2:%[0-9]+]]:vgpr(s32) = G_OR [[AND2]], [[SHL2]] + ; CHECK: [[BITCAST10:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND3:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST3]], [[COPY19]] + ; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL3:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR3]], [[COPY20]](s32) + ; CHECK: [[OR3:%[0-9]+]]:vgpr(s32) = G_OR [[AND3]], [[SHL3]] + ; CHECK: [[BITCAST11:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS 
[[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>) + ; CHECK: G_STORE [[CONCAT_VECTORS1]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[COPY21:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND4:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST4]], [[COPY21]] + ; CHECK: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL4:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR4]], [[COPY22]](s32) + ; CHECK: [[OR4:%[0-9]+]]:vgpr(s32) = G_OR [[AND4]], [[SHL4]] + ; CHECK: [[BITCAST12:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[COPY23:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST5]], [[COPY23]] + ; CHECK: [[COPY24:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL5:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR5]], [[COPY24]](s32) + ; CHECK: [[OR5:%[0-9]+]]:vgpr(s32) = G_OR [[AND5]], [[SHL5]] + ; CHECK: [[BITCAST13:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; CHECK: [[COPY25:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST6]], [[COPY25]] + ; CHECK: [[COPY26:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL6:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR6]], [[COPY26]](s32) + ; CHECK: [[OR6:%[0-9]+]]:vgpr(s32) = G_OR [[AND6]], [[SHL6]] + ; CHECK: [[BITCAST14:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; CHECK: [[COPY27:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND7:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST7]], [[COPY27]] + ; CHECK: [[COPY28:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL7:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR7]], [[COPY28]](s32) + ; CHECK: [[OR7:%[0-9]+]]:vgpr(s32) = G_OR [[AND7]], [[SHL7]] + ; CHECK: [[BITCAST15:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>), [[BITCAST14]](<2 x s16>), 
[[BITCAST15]](<2 x s16>) + ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; CHECK: G_STORE [[CONCAT_VECTORS2]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef` + 128, align 32, addrspace 1) ; CHECK: S_ENDPGM 0 ; GREEDY-LABEL: name: s_buffer_load_v16i16_vgpr_offset ; GREEDY: bb.1 (%ir-block.0): @@ -817,11 +869,87 @@ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) - ; GREEDY: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1) - ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 - ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; GREEDY: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef` + 16, basealign 32, addrspace 1) + ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) + ; GREEDY: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; GREEDY: 
[[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[COPY5]](s32) + ; GREEDY: [[BITCAST1:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR1:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST1]], [[COPY6]](s32) + ; GREEDY: [[BITCAST2:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR2:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST2]], [[COPY7]](s32) + ; GREEDY: [[BITCAST3:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR3:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST3]], [[COPY8]](s32) + ; GREEDY: [[BITCAST4:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR4:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST4]], [[COPY9]](s32) + ; GREEDY: [[BITCAST5:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR5:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST5]], [[COPY10]](s32) + ; GREEDY: [[BITCAST6:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GREEDY: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR6:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST6]], [[COPY11]](s32) + ; GREEDY: [[BITCAST7:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR7:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST7]], [[COPY12]](s32) + ; GREEDY: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; GREEDY: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST]], [[COPY13]] + ; GREEDY: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR]], [[COPY14]](s32) + ; GREEDY: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] + ; 
GREEDY: [[BITCAST8:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GREEDY: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST1]], [[COPY15]] + ; GREEDY: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL1:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR1]], [[COPY16]](s32) + ; GREEDY: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[AND1]], [[SHL1]] + ; GREEDY: [[BITCAST9:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GREEDY: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND2:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST2]], [[COPY17]] + ; GREEDY: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL2:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR2]], [[COPY18]](s32) + ; GREEDY: [[OR2:%[0-9]+]]:vgpr(s32) = G_OR [[AND2]], [[SHL2]] + ; GREEDY: [[BITCAST10:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GREEDY: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND3:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST3]], [[COPY19]] + ; GREEDY: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL3:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR3]], [[COPY20]](s32) + ; GREEDY: [[OR3:%[0-9]+]]:vgpr(s32) = G_OR [[AND3]], [[SHL3]] + ; GREEDY: [[BITCAST11:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GREEDY: [[CONCAT_VECTORS1:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>), [[BITCAST10]](<2 x s16>), [[BITCAST11]](<2 x s16>) + ; GREEDY: G_STORE [[CONCAT_VECTORS1]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1) + ; GREEDY: [[COPY21:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND4:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST4]], [[COPY21]] + ; GREEDY: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL4:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR4]], [[COPY22]](s32) + ; GREEDY: [[OR4:%[0-9]+]]:vgpr(s32) = G_OR [[AND4]], [[SHL4]] + ; GREEDY: [[BITCAST12:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST 
[[OR4]](s32) + ; GREEDY: [[COPY23:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND5:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST5]], [[COPY23]] + ; GREEDY: [[COPY24:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL5:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR5]], [[COPY24]](s32) + ; GREEDY: [[OR5:%[0-9]+]]:vgpr(s32) = G_OR [[AND5]], [[SHL5]] + ; GREEDY: [[BITCAST13:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GREEDY: [[COPY25:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND6:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST6]], [[COPY25]] + ; GREEDY: [[COPY26:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL6:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR6]], [[COPY26]](s32) + ; GREEDY: [[OR6:%[0-9]+]]:vgpr(s32) = G_OR [[AND6]], [[SHL6]] + ; GREEDY: [[BITCAST14:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; GREEDY: [[COPY27:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND7:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST7]], [[COPY27]] + ; GREEDY: [[COPY28:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL7:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR7]], [[COPY28]](s32) + ; GREEDY: [[OR7:%[0-9]+]]:vgpr(s32) = G_OR [[AND7]], [[SHL7]] + ; GREEDY: [[BITCAST15:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; GREEDY: [[CONCAT_VECTORS2:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST12]](<2 x s16>), [[BITCAST13]](<2 x s16>), [[BITCAST14]](<2 x s16>), [[BITCAST15]](<2 x s16>) + ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; GREEDY: G_STORE [[CONCAT_VECTORS2]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<16 x i16> addrspace(1)* undef` + 128, align 32, addrspace 1) ; GREEDY: S_ENDPGM 0 %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0) store <16 x i16> %val, <16 x i16> addrspace(1)* undef @@ -847,17 +975,167 @@ ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), 
[[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>) - ; CHECK: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) - ; CHECK: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1) - ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 16, basealign 64, addrspace 1) - ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 - ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) - ; CHECK: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 32, align 32, basealign 64, addrspace 1) - ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 - ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) - ; CHECK: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 48, basealign 64, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x 
s16>), [[UV8:%[0-9]+]]:vgpr(<2 x s16>), [[UV9:%[0-9]+]]:vgpr(<2 x s16>), [[UV10:%[0-9]+]]:vgpr(<2 x s16>), [[UV11:%[0-9]+]]:vgpr(<2 x s16>), [[UV12:%[0-9]+]]:vgpr(<2 x s16>), [[UV13:%[0-9]+]]:vgpr(<2 x s16>), [[UV14:%[0-9]+]]:vgpr(<2 x s16>), [[UV15:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[COPY5]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST1]], [[COPY6]](s32) + ; CHECK: [[BITCAST2:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR2:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST2]], [[COPY7]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST3]], [[COPY8]](s32) + ; CHECK: [[BITCAST4:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR4:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST4]], [[COPY9]](s32) + ; CHECK: [[BITCAST5:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR5:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST5]], [[COPY10]](s32) + ; CHECK: [[BITCAST6:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR6:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST6]], [[COPY11]](s32) + ; CHECK: [[BITCAST7:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: 
[[LSHR7:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST7]], [[COPY12]](s32) + ; CHECK: [[BITCAST8:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR8:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST8]], [[COPY13]](s32) + ; CHECK: [[BITCAST9:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR9:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST9]], [[COPY14]](s32) + ; CHECK: [[BITCAST10:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR10:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST10]], [[COPY15]](s32) + ; CHECK: [[BITCAST11:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR11:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST11]], [[COPY16]](s32) + ; CHECK: [[BITCAST12:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR12:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST12]], [[COPY17]](s32) + ; CHECK: [[BITCAST13:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR13:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST13]], [[COPY18]](s32) + ; CHECK: [[BITCAST14:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR14:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST14]], [[COPY19]](s32) + ; CHECK: [[BITCAST15:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[LSHR15:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST15]], [[COPY20]](s32) + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY21:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST]], [[COPY21]] + ; CHECK: [[COPY22:%[0-9]+]]:vgpr(s32) = 
COPY [[C2]](s32) + ; CHECK: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR]], [[COPY22]](s32) + ; CHECK: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[BITCAST16:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: [[COPY23:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST1]], [[COPY23]] + ; CHECK: [[COPY24:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR1]], [[COPY24]](s32) + ; CHECK: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[AND1]], [[SHL1]] + ; CHECK: [[BITCAST17:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK: [[COPY25:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND2:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST2]], [[COPY25]] + ; CHECK: [[COPY26:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL2:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR2]], [[COPY26]](s32) + ; CHECK: [[OR2:%[0-9]+]]:vgpr(s32) = G_OR [[AND2]], [[SHL2]] + ; CHECK: [[BITCAST18:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; CHECK: [[COPY27:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND3:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST3]], [[COPY27]] + ; CHECK: [[COPY28:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL3:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR3]], [[COPY28]](s32) + ; CHECK: [[OR3:%[0-9]+]]:vgpr(s32) = G_OR [[AND3]], [[SHL3]] + ; CHECK: [[BITCAST19:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST16]](<2 x s16>), [[BITCAST17]](<2 x s16>), [[BITCAST18]](<2 x s16>), [[BITCAST19]](<2 x s16>) + ; CHECK: G_STORE [[CONCAT_VECTORS1]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[COPY29:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND4:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST4]], [[COPY29]] + ; CHECK: [[COPY30:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL4:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR4]], 
[[COPY30]](s32) + ; CHECK: [[OR4:%[0-9]+]]:vgpr(s32) = G_OR [[AND4]], [[SHL4]] + ; CHECK: [[BITCAST20:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST5]], [[COPY31]] + ; CHECK: [[COPY32:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL5:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR5]], [[COPY32]](s32) + ; CHECK: [[OR5:%[0-9]+]]:vgpr(s32) = G_OR [[AND5]], [[SHL5]] + ; CHECK: [[BITCAST21:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; CHECK: [[COPY33:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND6:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST6]], [[COPY33]] + ; CHECK: [[COPY34:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL6:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR6]], [[COPY34]](s32) + ; CHECK: [[OR6:%[0-9]+]]:vgpr(s32) = G_OR [[AND6]], [[SHL6]] + ; CHECK: [[BITCAST22:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; CHECK: [[COPY35:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND7:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST7]], [[COPY35]] + ; CHECK: [[COPY36:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL7:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR7]], [[COPY36]](s32) + ; CHECK: [[OR7:%[0-9]+]]:vgpr(s32) = G_OR [[AND7]], [[SHL7]] + ; CHECK: [[BITCAST23:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST20]](<2 x s16>), [[BITCAST21]](<2 x s16>), [[BITCAST22]](<2 x s16>), [[BITCAST23]](<2 x s16>) + ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; CHECK: G_STORE [[CONCAT_VECTORS2]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 128, align 64, addrspace 1) + ; CHECK: [[C5:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C5]](s64) + ; CHECK: [[COPY37:%[0-9]+]]:vgpr(s32) = COPY 
[[C3]](s32) + ; CHECK: [[AND8:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST8]], [[COPY37]] + ; CHECK: [[COPY38:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL8:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR8]], [[COPY38]](s32) + ; CHECK: [[OR8:%[0-9]+]]:vgpr(s32) = G_OR [[AND8]], [[SHL8]] + ; CHECK: [[BITCAST24:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; CHECK: [[COPY39:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND9:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST9]], [[COPY39]] + ; CHECK: [[COPY40:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL9:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR9]], [[COPY40]](s32) + ; CHECK: [[OR9:%[0-9]+]]:vgpr(s32) = G_OR [[AND9]], [[SHL9]] + ; CHECK: [[BITCAST25:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR9]](s32) + ; CHECK: [[COPY41:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND10:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST10]], [[COPY41]] + ; CHECK: [[COPY42:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL10:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR10]], [[COPY42]](s32) + ; CHECK: [[OR10:%[0-9]+]]:vgpr(s32) = G_OR [[AND10]], [[SHL10]] + ; CHECK: [[BITCAST26:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR10]](s32) + ; CHECK: [[COPY43:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND11:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST11]], [[COPY43]] + ; CHECK: [[COPY44:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL11:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR11]], [[COPY44]](s32) + ; CHECK: [[OR11:%[0-9]+]]:vgpr(s32) = G_OR [[AND11]], [[SHL11]] + ; CHECK: [[BITCAST27:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR11]](s32) + ; CHECK: [[CONCAT_VECTORS3:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST24]](<2 x s16>), [[BITCAST25]](<2 x s16>), [[BITCAST26]](<2 x s16>), [[BITCAST27]](<2 x s16>) + ; CHECK: G_STORE [[CONCAT_VECTORS3]](<8 x s16>), [[PTR_ADD1]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 256, align 64, addrspace 1) + ; CHECK: [[COPY45:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND12:%[0-9]+]]:vgpr(s32) = 
G_AND [[BITCAST12]], [[COPY45]] + ; CHECK: [[COPY46:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL12:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR12]], [[COPY46]](s32) + ; CHECK: [[OR12:%[0-9]+]]:vgpr(s32) = G_OR [[AND12]], [[SHL12]] + ; CHECK: [[BITCAST28:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR12]](s32) + ; CHECK: [[COPY47:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND13:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST13]], [[COPY47]] + ; CHECK: [[COPY48:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL13:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR13]], [[COPY48]](s32) + ; CHECK: [[OR13:%[0-9]+]]:vgpr(s32) = G_OR [[AND13]], [[SHL13]] + ; CHECK: [[BITCAST29:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR13]](s32) + ; CHECK: [[COPY49:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND14:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST14]], [[COPY49]] + ; CHECK: [[COPY50:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL14:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR14]], [[COPY50]](s32) + ; CHECK: [[OR14:%[0-9]+]]:vgpr(s32) = G_OR [[AND14]], [[SHL14]] + ; CHECK: [[BITCAST30:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR14]](s32) + ; CHECK: [[COPY51:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK: [[AND15:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST15]], [[COPY51]] + ; CHECK: [[COPY52:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK: [[SHL15:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR15]], [[COPY52]](s32) + ; CHECK: [[OR15:%[0-9]+]]:vgpr(s32) = G_OR [[AND15]], [[SHL15]] + ; CHECK: [[BITCAST31:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR15]](s32) + ; CHECK: [[CONCAT_VECTORS4:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST28]](<2 x s16>), [[BITCAST29]](<2 x s16>), [[BITCAST30]](<2 x s16>), [[BITCAST31]](<2 x s16>) + ; CHECK: [[C6:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C6]](s64) + ; CHECK: G_STORE [[CONCAT_VECTORS4]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 384, align 64, addrspace 1) ; 
CHECK: S_ENDPGM 0 ; GREEDY-LABEL: name: s_buffer_load_v32i16_vgpr_offset ; GREEDY: bb.1 (%ir-block.0): @@ -876,17 +1154,167 @@ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) - ; GREEDY: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1) - ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 - ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; GREEDY: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 16, basealign 64, addrspace 1) - ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 - ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) - ; GREEDY: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 32, align 32, basealign 64, addrspace 1) - ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 - ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) - ; GREEDY: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 48, basealign 64, addrspace 1) + ; GREEDY: 
[[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>), [[UV8:%[0-9]+]]:vgpr(<2 x s16>), [[UV9:%[0-9]+]]:vgpr(<2 x s16>), [[UV10:%[0-9]+]]:vgpr(<2 x s16>), [[UV11:%[0-9]+]]:vgpr(<2 x s16>), [[UV12:%[0-9]+]]:vgpr(<2 x s16>), [[UV13:%[0-9]+]]:vgpr(<2 x s16>), [[UV14:%[0-9]+]]:vgpr(<2 x s16>), [[UV15:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) + ; GREEDY: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[COPY5]](s32) + ; GREEDY: [[BITCAST1:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR1:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST1]], [[COPY6]](s32) + ; GREEDY: [[BITCAST2:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR2:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST2]], [[COPY7]](s32) + ; GREEDY: [[BITCAST3:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GREEDY: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR3:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST3]], [[COPY8]](s32) + ; GREEDY: [[BITCAST4:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GREEDY: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR4:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST4]], [[COPY9]](s32) + ; GREEDY: [[BITCAST5:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GREEDY: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR5:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST5]], [[COPY10]](s32) + ; GREEDY: [[BITCAST6:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GREEDY: 
[[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR6:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST6]], [[COPY11]](s32) + ; GREEDY: [[BITCAST7:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GREEDY: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR7:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST7]], [[COPY12]](s32) + ; GREEDY: [[BITCAST8:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; GREEDY: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR8:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST8]], [[COPY13]](s32) + ; GREEDY: [[BITCAST9:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GREEDY: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR9:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST9]], [[COPY14]](s32) + ; GREEDY: [[BITCAST10:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV10]](<2 x s16>) + ; GREEDY: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR10:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST10]], [[COPY15]](s32) + ; GREEDY: [[BITCAST11:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV11]](<2 x s16>) + ; GREEDY: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR11:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST11]], [[COPY16]](s32) + ; GREEDY: [[BITCAST12:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV12]](<2 x s16>) + ; GREEDY: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR12:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST12]], [[COPY17]](s32) + ; GREEDY: [[BITCAST13:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV13]](<2 x s16>) + ; GREEDY: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR13:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST13]], [[COPY18]](s32) + ; GREEDY: [[BITCAST14:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV14]](<2 x s16>) + ; GREEDY: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[LSHR14:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST14]], [[COPY19]](s32) + ; GREEDY: [[BITCAST15:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV15]](<2 x s16>) + ; GREEDY: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: 
[[LSHR15:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST15]], [[COPY20]](s32) + ; GREEDY: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; GREEDY: [[COPY21:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST]], [[COPY21]] + ; GREEDY: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR]], [[COPY22]](s32) + ; GREEDY: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] + ; GREEDY: [[BITCAST16:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GREEDY: [[COPY23:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST1]], [[COPY23]] + ; GREEDY: [[COPY24:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL1:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR1]], [[COPY24]](s32) + ; GREEDY: [[OR1:%[0-9]+]]:vgpr(s32) = G_OR [[AND1]], [[SHL1]] + ; GREEDY: [[BITCAST17:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GREEDY: [[COPY25:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND2:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST2]], [[COPY25]] + ; GREEDY: [[COPY26:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL2:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR2]], [[COPY26]](s32) + ; GREEDY: [[OR2:%[0-9]+]]:vgpr(s32) = G_OR [[AND2]], [[SHL2]] + ; GREEDY: [[BITCAST18:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GREEDY: [[COPY27:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND3:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST3]], [[COPY27]] + ; GREEDY: [[COPY28:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL3:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR3]], [[COPY28]](s32) + ; GREEDY: [[OR3:%[0-9]+]]:vgpr(s32) = G_OR [[AND3]], [[SHL3]] + ; GREEDY: [[BITCAST19:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GREEDY: [[CONCAT_VECTORS1:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST16]](<2 x s16>), [[BITCAST17]](<2 x s16>), [[BITCAST18]](<2 x s16>), [[BITCAST19]](<2 x s16>) + ; GREEDY: G_STORE [[CONCAT_VECTORS1]](<8 x s16>), [[DEF]](p1) :: (store (<8 x 
s16>) into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1) + ; GREEDY: [[COPY29:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND4:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST4]], [[COPY29]] + ; GREEDY: [[COPY30:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL4:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR4]], [[COPY30]](s32) + ; GREEDY: [[OR4:%[0-9]+]]:vgpr(s32) = G_OR [[AND4]], [[SHL4]] + ; GREEDY: [[BITCAST20:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GREEDY: [[COPY31:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND5:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST5]], [[COPY31]] + ; GREEDY: [[COPY32:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL5:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR5]], [[COPY32]](s32) + ; GREEDY: [[OR5:%[0-9]+]]:vgpr(s32) = G_OR [[AND5]], [[SHL5]] + ; GREEDY: [[BITCAST21:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GREEDY: [[COPY33:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND6:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST6]], [[COPY33]] + ; GREEDY: [[COPY34:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL6:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR6]], [[COPY34]](s32) + ; GREEDY: [[OR6:%[0-9]+]]:vgpr(s32) = G_OR [[AND6]], [[SHL6]] + ; GREEDY: [[BITCAST22:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; GREEDY: [[COPY35:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND7:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST7]], [[COPY35]] + ; GREEDY: [[COPY36:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL7:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR7]], [[COPY36]](s32) + ; GREEDY: [[OR7:%[0-9]+]]:vgpr(s32) = G_OR [[AND7]], [[SHL7]] + ; GREEDY: [[BITCAST23:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; GREEDY: [[CONCAT_VECTORS2:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST20]](<2 x s16>), [[BITCAST21]](<2 x s16>), [[BITCAST22]](<2 x s16>), [[BITCAST23]](<2 x s16>) + ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + 
; GREEDY: G_STORE [[CONCAT_VECTORS2]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 128, align 64, addrspace 1) + ; GREEDY: [[C5:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C5]](s64) + ; GREEDY: [[COPY37:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND8:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST8]], [[COPY37]] + ; GREEDY: [[COPY38:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL8:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR8]], [[COPY38]](s32) + ; GREEDY: [[OR8:%[0-9]+]]:vgpr(s32) = G_OR [[AND8]], [[SHL8]] + ; GREEDY: [[BITCAST24:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; GREEDY: [[COPY39:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND9:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST9]], [[COPY39]] + ; GREEDY: [[COPY40:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL9:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR9]], [[COPY40]](s32) + ; GREEDY: [[OR9:%[0-9]+]]:vgpr(s32) = G_OR [[AND9]], [[SHL9]] + ; GREEDY: [[BITCAST25:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR9]](s32) + ; GREEDY: [[COPY41:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND10:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST10]], [[COPY41]] + ; GREEDY: [[COPY42:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL10:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR10]], [[COPY42]](s32) + ; GREEDY: [[OR10:%[0-9]+]]:vgpr(s32) = G_OR [[AND10]], [[SHL10]] + ; GREEDY: [[BITCAST26:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR10]](s32) + ; GREEDY: [[COPY43:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND11:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST11]], [[COPY43]] + ; GREEDY: [[COPY44:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL11:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR11]], [[COPY44]](s32) + ; GREEDY: [[OR11:%[0-9]+]]:vgpr(s32) = G_OR [[AND11]], [[SHL11]] + ; GREEDY: [[BITCAST27:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR11]](s32) + ; GREEDY: [[CONCAT_VECTORS3:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS 
[[BITCAST24]](<2 x s16>), [[BITCAST25]](<2 x s16>), [[BITCAST26]](<2 x s16>), [[BITCAST27]](<2 x s16>) + ; GREEDY: G_STORE [[CONCAT_VECTORS3]](<8 x s16>), [[PTR_ADD1]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 256, align 64, addrspace 1) + ; GREEDY: [[COPY45:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND12:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST12]], [[COPY45]] + ; GREEDY: [[COPY46:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL12:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR12]], [[COPY46]](s32) + ; GREEDY: [[OR12:%[0-9]+]]:vgpr(s32) = G_OR [[AND12]], [[SHL12]] + ; GREEDY: [[BITCAST28:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR12]](s32) + ; GREEDY: [[COPY47:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND13:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST13]], [[COPY47]] + ; GREEDY: [[COPY48:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL13:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR13]], [[COPY48]](s32) + ; GREEDY: [[OR13:%[0-9]+]]:vgpr(s32) = G_OR [[AND13]], [[SHL13]] + ; GREEDY: [[BITCAST29:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR13]](s32) + ; GREEDY: [[COPY49:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND14:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST14]], [[COPY49]] + ; GREEDY: [[COPY50:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL14:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR14]], [[COPY50]](s32) + ; GREEDY: [[OR14:%[0-9]+]]:vgpr(s32) = G_OR [[AND14]], [[SHL14]] + ; GREEDY: [[BITCAST30:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR14]](s32) + ; GREEDY: [[COPY51:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; GREEDY: [[AND15:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST15]], [[COPY51]] + ; GREEDY: [[COPY52:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; GREEDY: [[SHL15:%[0-9]+]]:vgpr(s32) = G_SHL [[LSHR15]], [[COPY52]](s32) + ; GREEDY: [[OR15:%[0-9]+]]:vgpr(s32) = G_OR [[AND15]], [[SHL15]] + ; GREEDY: [[BITCAST31:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR15]](s32) + ; GREEDY: [[CONCAT_VECTORS4:%[0-9]+]]:vgpr(<8 x s16>) = G_CONCAT_VECTORS 
[[BITCAST28]](<2 x s16>), [[BITCAST29]](<2 x s16>), [[BITCAST30]](<2 x s16>), [[BITCAST31]](<2 x s16>) + ; GREEDY: [[C6:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C6]](s64) + ; GREEDY: G_STORE [[CONCAT_VECTORS4]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `<32 x i16> addrspace(1)* undef` + 384, align 64, addrspace 1) ; GREEDY: S_ENDPGM 0 %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0) store <32 x i16> %val, <32 x i16> addrspace(1)* undef @@ -910,11 +1338,13 @@ ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>) - ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) - ; CHECK: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64) + ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV2]](s64), [[UV3]](s64) ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD 
[[DEF]], [[C2]](s64) - ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef` + 16, basealign 32, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef` + 128, align 32, addrspace 1) ; CHECK: S_ENDPGM 0 ; GREEDY-LABEL: name: s_buffer_load_v4i64_vgpr_offset ; GREEDY: bb.1 (%ir-block.0): @@ -931,11 +1361,13 @@ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) - ; GREEDY: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1) + ; GREEDY: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64) + ; GREEDY: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1) + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV2]](s64), [[UV3]](s64) ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; GREEDY: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* 
undef` + 16, basealign 32, addrspace 1) + ; GREEDY: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<4 x i64> addrspace(1)* undef` + 128, align 32, addrspace 1) ; GREEDY: S_ENDPGM 0 %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0) store <4 x i64> %val, <4 x i64> addrspace(1)* undef @@ -961,17 +1393,21 @@ ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>) - ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) - ; CHECK: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64) + ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV2]](s64), [[UV3]](s64) ; CHECK: 
[[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 16, basealign 64, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 128, align 64, addrspace 1) ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) - ; CHECK: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 32, align 32, basealign 64, addrspace 1) + ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV4]](s64), [[UV5]](s64) + ; CHECK: G_STORE [[BUILD_VECTOR3]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 256, align 64, addrspace 1) + ; CHECK: [[BUILD_VECTOR4:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV6]](s64), [[UV7]](s64) ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) - ; CHECK: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 48, basealign 64, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR4]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 384, align 64, addrspace 1) ; CHECK: S_ENDPGM 0 ; GREEDY-LABEL: name: s_buffer_load_v8i64_vgpr_offset ; GREEDY: bb.1 (%ir-block.0): @@ -990,17 +1426,21 @@ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: 
(dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) - ; GREEDY: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1) + ; GREEDY: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64) + ; GREEDY: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1) + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV2]](s64), [[UV3]](s64) ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; GREEDY: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 16, basealign 64, addrspace 1) + ; GREEDY: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 128, align 64, addrspace 1) ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) - ; GREEDY: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 32, align 32, basealign 64, addrspace 1) + ; GREEDY: [[BUILD_VECTOR3:%[0-9]+]]:vgpr(<2 x s64>) 
= G_BUILD_VECTOR [[UV4]](s64), [[UV5]](s64) + ; GREEDY: G_STORE [[BUILD_VECTOR3]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 256, align 64, addrspace 1) + ; GREEDY: [[BUILD_VECTOR4:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV6]](s64), [[UV7]](s64) ; GREEDY: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) - ; GREEDY: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 48, basealign 64, addrspace 1) + ; GREEDY: G_STORE [[BUILD_VECTOR4]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `<8 x i64> addrspace(1)* undef` + 384, align 64, addrspace 1) ; GREEDY: S_ENDPGM 0 %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0) store <8 x i64> %val, <8 x i64> addrspace(1)* undef @@ -1024,11 +1464,13 @@ ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>) - ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) - ; CHECK: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:vgpr(p1), [[UV1:%[0-9]+]]:vgpr(p1), [[UV2:%[0-9]+]]:vgpr(p1), [[UV3:%[0-9]+]]:vgpr(p1) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV]](p1), [[UV1]](p1) + ; CHECK: G_STORE 
[[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV2]](p1), [[UV3]](p1) ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, basealign 32, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 128, align 32, addrspace 1) ; CHECK: S_ENDPGM 0 ; GREEDY-LABEL: name: s_buffer_load_v4p1_vgpr_offset ; GREEDY: bb.1 (%ir-block.0): @@ -1045,11 +1487,13 @@ ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) - ; GREEDY: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1) + ; GREEDY: [[UV:%[0-9]+]]:vgpr(p1), [[UV1:%[0-9]+]]:vgpr(p1), [[UV2:%[0-9]+]]:vgpr(p1), [[UV3:%[0-9]+]]:vgpr(p1) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV]](p1), [[UV1]](p1) + ; GREEDY: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<4 x i8 addrspace(1)*> addrspace(1)* 
undef`, align 32, addrspace 1) + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[UV2]](p1), [[UV3]](p1) ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; GREEDY: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, basealign 32, addrspace 1) + ; GREEDY: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 128, align 32, addrspace 1) ; GREEDY: S_ENDPGM 0 %val = call <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0) store <4 x i8 addrspace(1)*> %val, <4 x i8 addrspace(1)*> addrspace(1)* undef @@ -1075,17 +1519,29 @@ ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>) - ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) - ; CHECK: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:vgpr(p1), [[UV1:%[0-9]+]]:vgpr(p1), [[UV2:%[0-9]+]]:vgpr(p1), [[UV3:%[0-9]+]]:vgpr(p1), [[UV4:%[0-9]+]]:vgpr(p1), [[UV5:%[0-9]+]]:vgpr(p1), [[UV6:%[0-9]+]]:vgpr(p1), 
[[UV7:%[0-9]+]]:vgpr(p1) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV]](p1) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV1]](p1) + ; CHECK: [[BITCAST2:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV2]](p1) + ; CHECK: [[BITCAST3:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV3]](p1) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST]](s64), [[BITCAST1]](s64) + ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST2]](s64), [[BITCAST3]](s64) ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 16, basealign 64, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 128, align 64, addrspace 1) ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) - ; CHECK: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 32, align 32, basealign 64, addrspace 1) + ; CHECK: [[BITCAST4:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV4]](p1) + ; CHECK: [[BITCAST5:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV5]](p1) + ; CHECK: [[BITCAST6:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV6]](p1) + ; CHECK: [[BITCAST7:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV7]](p1) + ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST4]](s64), [[BITCAST5]](s64) + ; CHECK: G_STORE [[BUILD_VECTOR3]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 256, align 64, addrspace 1) + ; CHECK: 
[[BUILD_VECTOR4:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST6]](s64), [[BITCAST7]](s64) ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) - ; CHECK: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, basealign 64, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR4]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 384, align 64, addrspace 1) ; CHECK: S_ENDPGM 0 ; GREEDY-LABEL: name: s_buffer_load_v8p1_vgpr_offset ; GREEDY: bb.1 (%ir-block.0): @@ -1104,17 +1560,29 @@ ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>) - ; GREEDY: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) - ; GREEDY: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1) + ; GREEDY: [[UV:%[0-9]+]]:vgpr(p1), [[UV1:%[0-9]+]]:vgpr(p1), [[UV2:%[0-9]+]]:vgpr(p1), [[UV3:%[0-9]+]]:vgpr(p1), [[UV4:%[0-9]+]]:vgpr(p1), [[UV5:%[0-9]+]]:vgpr(p1), [[UV6:%[0-9]+]]:vgpr(p1), [[UV7:%[0-9]+]]:vgpr(p1) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) + ; GREEDY: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST 
[[UV]](p1) + ; GREEDY: [[BITCAST1:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV1]](p1) + ; GREEDY: [[BITCAST2:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV2]](p1) + ; GREEDY: [[BITCAST3:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV3]](p1) + ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST]](s64), [[BITCAST1]](s64) + ; GREEDY: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1) + ; GREEDY: [[BUILD_VECTOR2:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST2]](s64), [[BITCAST3]](s64) ; GREEDY: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 ; GREEDY: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) - ; GREEDY: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 16, basealign 64, addrspace 1) + ; GREEDY: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 128, align 64, addrspace 1) ; GREEDY: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 ; GREEDY: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) - ; GREEDY: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 32, align 32, basealign 64, addrspace 1) + ; GREEDY: [[BITCAST4:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV4]](p1) + ; GREEDY: [[BITCAST5:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV5]](p1) + ; GREEDY: [[BITCAST6:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV6]](p1) + ; GREEDY: [[BITCAST7:%[0-9]+]]:vgpr(s64) = G_BITCAST [[UV7]](p1) + ; GREEDY: [[BUILD_VECTOR3:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST4]](s64), [[BITCAST5]](s64) + ; GREEDY: G_STORE [[BUILD_VECTOR3]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 256, align 64, addrspace 1) + ; GREEDY: [[BUILD_VECTOR4:%[0-9]+]]:vgpr(<2 x s64>) = G_BUILD_VECTOR [[BITCAST6]](s64), [[BITCAST7]](s64) ; GREEDY: 
[[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 ; GREEDY: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) - ; GREEDY: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, basealign 64, addrspace 1) + ; GREEDY: G_STORE [[BUILD_VECTOR4]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 384, align 64, addrspace 1) ; GREEDY: S_ENDPGM 0 %val = call <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0) store <8 x i8 addrspace(1)*> %val, <8 x i8 addrspace(1)*> addrspace(1)* undef Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -309,13 +309,13 @@ ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 +; GFX9-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align4: @@ -324,13 +324,13 @@ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 
-; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset0:2 offset1:3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 +; GFX7-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align4: @@ -340,12 +340,12 @@ ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 +; GFX10-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -52,39 +52,39 @@ ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: s_lshr_b32 s0, s12, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s12, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; 
GFX9-NEXT: s_lshr_b32 s3, s12, 24 +; GFX9-NEXT: s_lshr_b32 s3, s4, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: s_lshr_b32 s0, s13, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s13, 16 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s13, 24 +; GFX9-NEXT: s_lshr_b32 s2, s5, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_lshr_b32 s0, s14, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s14, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s14, 24 +; GFX9-NEXT: s_lshr_b32 s2, s6, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 @@ -135,27 +135,27 @@ ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s12, 8 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s5, s13, 24 -; GFX10-NEXT: v_mov_b32_e32 v0, s12 -; GFX10-NEXT: s_lshr_b32 s1, s12, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s13 -; GFX10-NEXT: s_lshr_b32 s3, s12, 24 -; GFX10-NEXT: s_lshr_b32 s6, s14, 8 +; GFX10-NEXT: s_lshr_b32 s7, s5, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_lshr_b32 s3, s4, 24 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s4, s5, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v9, s5 -; GFX10-NEXT: s_lshr_b32 s2, s13, 8 -; GFX10-NEXT: s_lshr_b32 s4, s13, 16 -; GFX10-NEXT: s_lshr_b32 s7, s14, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: v_mov_b32_e32 v9, s7 +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: s_lshr_b32 s8, s14, 24 +; GFX10-NEXT: s_lshr_b32 s9, s6, 24 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v10, s6 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s4 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -165,8 +165,8 @@ ; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 @@ -181,21 +181,21 @@ ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: s_lshr_b32 s0, s12, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: s_lshr_b32 s0, s13, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: ds_write_b16 
v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_lshr_b32 s0, s14, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s6, 16 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 @@ -228,16 +228,16 @@ ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s0, s12, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s13 -; GFX10-NEXT: s_lshr_b32 s1, s13, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s14 -; GFX10-NEXT: s_lshr_b32 s2, s14, 16 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: s_lshr_b32 s2, s6, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 @@ -256,12 +256,12 @@ ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -283,13 +283,13 @@ ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 
0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s12 -; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX10-NEXT: s_endpgm @@ -301,12 +301,12 @@ ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: s_endpgm @@ -328,13 +328,13 @@ ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s12 -; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX10-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/vector-legalizer-after-legalizer.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/vector-legalizer-after-legalizer.ll @@ -0,0 
+1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -stop-after=legalizer -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s + +define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { + ; GFX10-LABEL: name: tbuffer_store_d16_xyz + ; GFX10: bb.1.main_body: + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[COPY1:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) + ; GFX10: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY1]], [[C]](s64) + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<4 x s32>) from %ir.rsrc.kernarg.offset.cast, align 4, addrspace 4) + ; GFX10: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY1]], [[C1]](s64) + ; GFX10: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>) from %ir.data.kernarg.offset.cast, align 4, addrspace 4) + ; GFX10: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 60 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY1]], [[C2]](s64) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load (s32) from %ir.vindex.kernarg.offset.cast, addrspace 4) + ; GFX10: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD1]](<4 x s16>) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C3]](s32) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C3]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX10: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[DEF1]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: G_AMDGPU_TBUFFER_STORE_FORMAT_D16 [[CONCAT_VECTORS]](<4 x s16>), [[LOAD]](<4 x s32>), [[LOAD2]](s32), [[C4]], [[C4]], 0, 33, 0, -1 :: (dereferenceable store (<3 x s16>), align 1, addrspace 4) + ; GFX10: S_ENDPGM 0 +main_body: + %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) + +define dllexport amdgpu_cs void @_amdgpu_cs_main(<2 x i32> inreg %pushConst0) { + ; GFX10-LABEL: name: _amdgpu_cs_main + ; GFX10: bb.1..entry: + ; GFX10: liveins: $sgpr0, $sgpr1 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) + ; GFX10: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[BITCAST]](s64) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[COPY2]](s32) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2 + ; GFX10: 
[[COPY3:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY2]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s16>) + ; GFX10: G_STORE [[BITCAST1]](<4 x s32>), [[INTTOPTR]](p1) :: (store (<4 x s32>) into %ir.1, align 8, addrspace 1) + ; GFX10: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[INTTOPTR]], [[C3]](s64) + ; GFX10: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[COPY2]](s32) + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX10: G_STORE [[CONCAT_VECTORS1]](<4 x s16>), [[PTR_ADD]](p1) :: (store (<4 x s16>) into %ir.1 + 128, addrspace 1) + ; GFX10: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD]], [[C4]](s64) + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) + ; GFX10: G_STORE [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[PTR_ADD1]](p1) :: (store (<2 x s16>) into %ir.1 + 192, align 8, addrspace 1) + ; GFX10: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C5]](s64) + ; GFX10: G_STORE [[C2]](s32), [[PTR_ADD2]](p1) :: 
(store (s16) into %ir.1 + 224, align 8, addrspace 1) + ; GFX10: S_ENDPGM 0 +.entry: + %0 = bitcast <2 x i32> %pushConst0 to i64 + %1 = inttoptr i64 %0 to <15 x i16> addrspace(1)* + store <15 x i16> , <15 x i16> addrspace(1)* %1, align 8 + ret void +} + +define <3 x i16> @add_v3i16(<3 x i16> %a, <3 x i16> %b) { + ; GFX10-LABEL: name: add_v3i16 + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[DEF]](s32) + ; GFX10: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX10: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; GFX10: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10: 
[[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[DEF]](s32) + ; GFX10: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX10: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX10: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[ADD]](<2 x s16>) + ; GFX10: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX10: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[ADD1]](<2 x s16>) + ; GFX10: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32) + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX10: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX10: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX10: [[COPY15:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; GFX10: S_SETPC_B64_return [[COPY15]], implicit $vgpr0, implicit $vgpr1 + %result = add <3 x i16> %a, %b + ret <3 x i16> %result +} + +define <3 x i8> @or_v3i8(<3 x i8> %a, <3 x i8> %b) { + ; GFX10-LABEL: name: or_v3i8 + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr30_sgpr31 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10: 
[[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY7]], [[COPY8]] + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[COPY10]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY11]], [[COPY12]] + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; GFX10: $vgpr0 = COPY [[COPY13]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[OR1]](s32) + ; GFX10: $vgpr1 = COPY [[COPY14]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; GFX10: $vgpr2 = COPY [[COPY15]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]] + ; GFX10: S_SETPC_B64_return [[COPY16]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %result = or <3 x i8> %a, %b + ret <3 x i8> %result +} + +define void @load_store_large(<11 x float> addrspace(1)* %ptr, <11 x float> addrspace(1)* %ptr2) { + ; GFX10-LABEL: name: load_store_large + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; GFX10: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[MV]](p1) :: (load (<8 x s32>) from %ir.ptr, align 4, addrspace 1) + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>) + ; GFX10: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[C]](s64) + ; GFX10: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<3 x s32>) from %ir.ptr + 256, align 4, addrspace 1) + ; GFX10: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<3 x s32>) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GFX10: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[MV1]](p1) :: (store (<4 x s32>) into %ir.ptr2, align 4, addrspace 1) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32) + ; GFX10: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[C1]](s64) + ; GFX10: G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[PTR_ADD1]](p1) :: (store (<4 x s32>) into %ir.ptr2 + 128, align 4, addrspace 1) + ; GFX10: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV8]](s32), [[UV9]](s32), [[UV10]](s32) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[C]](s64) + ; GFX10: G_STORE [[BUILD_VECTOR2]](<3 x s32>), [[PTR_ADD2]](p1) :: (store (<3 x s32>) into %ir.ptr2 + 256, align 4, addrspace 1) + ; GFX10: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; GFX10: S_SETPC_B64_return [[COPY5]] + %load = load <11 x float>, <11 x float> addrspace(1)* %ptr, align 4 + store <11 x float> %load, <11 x float> addrspace(1)* %ptr2, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/vector-legalizer.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/vector-legalizer.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s + +define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +; GFX10-LABEL: tbuffer_store_d16_xyz: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_pack_lh_b32_b16 s2, s2, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; GFX10-NEXT: s_endpgm +main_body: + %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) + +define dllexport amdgpu_cs void @_amdgpu_cs_main(<2 x i32> inreg %pushConst0) { +; GFX10-LABEL: _amdgpu_cs_main: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_mov_b32 s4, 0x10006 +; GFX10-NEXT: s_mov_b32 s7, 0xfffe0001 +; GFX10-NEXT: s_mov_b32 s5, 0x1fffe +; GFX10-NEXT: s_mov_b32 s6, 0x6fffe +; GFX10-NEXT: s_mov_b32 s3, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_mov_b32 s2, s7 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, 0x1fffe +; GFX10-NEXT: v_mov_b32_e32 v8, -2 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 +; GFX10-NEXT: 
global_store_dword v6, v7, s[0:1] offset:24 +; GFX10-NEXT: global_store_short v6, v8, s[0:1] offset:28 +; GFX10-NEXT: s_endpgm +.entry: + %0 = bitcast <2 x i32> %pushConst0 to i64 + %1 = inttoptr i64 %0 to <15 x i16> addrspace(1)* + store <15 x i16> , <15 x i16> addrspace(1)* %1, align 8 + ret void +} + +define <3 x i16> @add_v3i16(<3 x i16> %a, <3 x i16> %b) { +; GFX10-LABEL: add_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, v6, s4 +; GFX10-NEXT: v_and_or_b32 v3, v3, v6, s4 +; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v5 +; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, v6, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = add <3 x i16> %a, %b + ret <3 x i16> %result +} + +define <3 x i8> @or_v3i8(<3 x i8> %a, <3 x i8> %b) { +; GFX10-LABEL: or_v3i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = or <3 x i8> %a, %b + ret <3 x i8> %result +} + +define void @load_store_large(<11 x float> addrspace(1)* %ptr, <11 x float> addrspace(1)* %ptr2) { +; GFX10-LABEL: load_store_large: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dwordx4 
v[4:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx3 v[12:14], v[0:1], off offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx3 v[2:3], v[12:14], off offset:32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %load = load <11 x float>, <11 x float> addrspace(1)* %ptr, align 4 + store <11 x float> %load, <11 x float> addrspace(1)* %ptr2, align 4 + ret void +}