diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -126,10 +126,10 @@
     bool SLC;
     bool DLC;
     bool UseST64;
-    SmallVector<MachineInstr *, 8> InstsToMove;
     int AddrIdx[5];
     const MachineOperand *AddrReg[5];
     unsigned NumAddresses;
+    unsigned Order;
 
     bool hasSameBaseAddress(const MachineInstr &MI) {
       for (unsigned i = 0; i < NumAddresses; i++) {
@@ -210,7 +210,7 @@
                                   const SIInstrInfo &TII,
                                   const CombineInfo &Paired);
   static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI,
-                                   CombineInfo &Paired);
+                                   CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI,
                         const CombineInfo &Paired);
   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
@@ -219,21 +219,38 @@
   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
 
-  bool findMatchingInst(CombineInfo &CI, CombineInfo &Paired);
+  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
+                            SmallVectorImpl<MachineInstr *> &InstsToMove);
 
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired);
+  MachineBasicBlock::iterator
+  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
 
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
-  MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
+  MachineBasicBlock::iterator
+  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
+                          const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+                      const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
 
   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                            int32_t NewOffset) const;
@@ -259,8 +276,6 @@
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  void removeCombinedInst(std::list<CombineInfo> &MergeList,
-                          const MachineInstr &MI);
   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                      bool &OptimizeListAgain);
   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
@@ -547,8 +562,6 @@
       AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
       AddrReg[i] = &I->getOperand(AddrIdx[i]);
     }
-
-    InstsToMove.clear();
   }
 }
 
 } // end anonymous namespace.
@@ -720,7 +733,8 @@
 
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                 const MCSubtargetInfo &STI,
-                                                CombineInfo &Paired) {
+                                                CombineInfo &Paired,
+                                                bool Modify) {
   assert(CI.InstClass != MIMG);
 
   // XXX - Would the same offset be OK? Is there any reason this would happen or
@@ -761,7 +775,7 @@
   CI.UseST64 = false;
   CI.BaseOff = 0;
 
-  // Handle SMEM and VMEM instructions.
+  // Handle all non-DS instructions.
   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
     return (EltOffset0 + CI.Width == EltOffset1 ||
             EltOffset1 + Paired.Width == EltOffset0) &&
@@ -769,20 +783,25 @@
            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
   }
 
+  // Handle DS instructions.
   // If the offset in elements doesn't fit in 8-bits, we might be able to use
   // the stride 64 versions.
   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
-    CI.Offset = EltOffset0 / 64;
-    Paired.Offset = EltOffset1 / 64;
-    CI.UseST64 = true;
+    if (Modify) {
+      CI.Offset = EltOffset0 / 64;
+      Paired.Offset = EltOffset1 / 64;
+      CI.UseST64 = true;
+    }
     return true;
   }
 
   // Check if the new offsets fit in the reduced 8-bit range.
   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
-    CI.Offset = EltOffset0;
-    Paired.Offset = EltOffset1;
+    if (Modify) {
+      CI.Offset = EltOffset0;
+      Paired.Offset = EltOffset1;
+    }
     return true;
   }
 
@@ -791,15 +810,19 @@
   CI.BaseOff = std::min(CI.Offset, Paired.Offset);
 
   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
-    CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
-    Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
-    CI.UseST64 = true;
+    if (Modify) {
+      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+      CI.UseST64 = true;
+    }
    return true;
   }
 
   if (isUInt<8>(OffsetDiff)) {
-    CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
-    Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+    if (Modify) {
+      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
+      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+    }
     return true;
   }
 
@@ -824,11 +847,19 @@
   }
 }
 
-bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
-                                            CombineInfo &Paired) {
-  MachineBasicBlock *MBB = CI.I->getParent();
-  MachineBasicBlock::iterator E = MBB->end();
-  MachineBasicBlock::iterator MBBI = CI.I;
+/// This function assumes that CI comes before Paired in a basic block.
+bool SILoadStoreOptimizer::checkAndPrepareMerge(
+    CombineInfo &CI, CombineInfo &Paired,
+    SmallVectorImpl<MachineInstr *> &InstsToMove) {
+
+  // Check both offsets (or masks for MIMG) can be combined and fit in the
+  // reduced range.
+  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
+    return false;
+
+  if (CI.InstClass != MIMG &&
+      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STI, Paired)))
+    return false;
 
   const unsigned Opc = CI.I->getOpcode();
   const InstClassEnum InstClass = getInstClass(Opc, *TII);
@@ -844,12 +875,12 @@
   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
     return false;
 
-  ++MBBI;
-
   DenseSet<unsigned> RegDefsToMove;
   DenseSet<unsigned> PhysRegUsesToMove;
   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
+  MachineBasicBlock::iterator E = std::next(Paired.I);
+  MachineBasicBlock::iterator MBBI = std::next(CI.I);
   for (; MBBI != E; ++MBBI) {
 
     if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
@@ -868,11 +899,11 @@
 
      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
-          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
+          !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
-        CI.InstsToMove.push_back(&*MBBI);
+        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }
 
      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                            CI.InstsToMove);
+                            InstsToMove);
      continue;
    }
@@ -901,26 +932,24 @@
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                              CI.InstsToMove))
+                              InstsToMove))
      continue;
 
-    bool Match = CI.hasSameBaseAddress(*MBBI);
-
-    if (Match) {
-      Paired.setMI(MBBI, *TII, *STM);
-
-      // Check both offsets (or masks for MIMG) can be combined and fit in the
-      // reduced range.
-      bool canBeCombined =
-          CI.InstClass == MIMG
-              ? dmasksCanBeCombined(CI, *TII, Paired)
-              : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired);
-
-      // We also need to go through the list of instructions that we plan to
+    if (&*MBBI == &*Paired.I) {
+      // We need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
-      if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
+
+        // Call offsetsCanBeCombined with Modify = true so that the offsets
+        // are correct for the new instruction.  This should return true,
+        // because this function should only be called on CombineInfo objects
+        // that have already been confirmed to be mergeable.
+        if (CI.InstClass != MIMG)
+          offsetsCanBeCombined(CI, *STI, Paired, true);
        return true;
+      }
+      return false;
    }
 
    // We've found a load/store that we couldn't merge for some reason.
@@ -929,7 +958,7 @@
    // down past this instruction.
    // check if we can move I across MBBI and if we can move all I's users
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
      break;
  }
  return false;
@@ -950,7 +979,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -1023,7 +1053,7 @@
       .add(*Dest1)
       .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1049,7 +1079,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1106,7 +1137,7 @@
           .addImm(0) // gds
           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
 
-  moveInstsAfter(Write2, CI.InstsToMove);
+  moveInstsAfter(Write2, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1116,7 +1147,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1161,15 +1193,16 @@
       .add(*Dest1)
       .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1211,15 +1244,16 @@
       .add(*Dest1)
       .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1273,15 +1307,16 @@
       .add(*Dest1)
       .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
@@ -1340,15 +1375,16 @@
       .add(*Dest1)
       .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  moveInstsAfter(Copy1, CI.InstsToMove);
+  moveInstsAfter(Copy1, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1403,7 +1439,7 @@
       .addMemOperand(
           combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
-  moveInstsAfter(MIB, CI.InstsToMove);
+  moveInstsAfter(MIB, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1509,8 +1545,9 @@
   }
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+    CombineInfo &CI, CombineInfo &Paired,
+    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1561,7 +1598,7 @@
       .addImm(0) // swz
       .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
-  moveInstsAfter(MIB, CI.InstsToMove);
+  moveInstsAfter(MIB, InstsToMove);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1904,6 +1941,7 @@
   SmallPtrSet<MachineInstr *, 4> AnchorList;
 
   // Sort potential mergeable instructions into lists.  One list per base address.
+  unsigned Order = 0;
   for (MachineInstr &MI : MBB.instrs()) {
     // We run this before checking if an address is mergeable, because it can produce
     // better code even if the instructions aren't mergeable.
@@ -1920,12 +1958,43 @@
     CombineInfo CI;
     CI.setMI(MI, *TII, *STM);
+    CI.Order = Order++;
 
     if (!CI.hasMergeableAddress(*MRI))
       continue;
 
     addInstToMergeableList(CI, MergeableInsts);
   }
+
+  // At this point we have lists of mergeable instructions.
+  //
+  // Part 2: Sort lists by offset and then for each CombineInfo object in the
+  // list try to find an instruction that can be merged with I.  If an
+  // instruction is found, it is stored in the Paired field.  If no
+  // instructions are found, then the CombineInfo object is deleted from the
+  // list.
+  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+                                                   E = MergeableInsts.end();
+       I != E;) {
+
+    std::list<CombineInfo> &MergeList = *I;
+    if (MergeList.size() <= 1) {
+      // This means we have found only one instruction with a given address
+      // that can be merged, and we need at least 2 instructions to do a
+      // merge, so this list can be discarded.
+      I = MergeableInsts.erase(I);
+      continue;
+    }
+
+    // Sort the lists by offsets, this way mergeable instructions will be
+    // adjacent to each other in the list, which will make it easier to find
+    // matches.
+    MergeList.sort(
+        [](const CombineInfo &A, const CombineInfo &B) {
+          return A.Offset < B.Offset;
+        });
+    ++I;
+  }
+
   return Modified;
 }
 
@@ -1936,117 +2005,124 @@
     std::list<std::list<CombineInfo> > &MergeableInsts) {
   bool Modified = false;
 
-  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
-    if (MergeList.size() < 2)
-      continue;
+  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+                                                   E = MergeableInsts.end();
+       I != E;) {
 
+    std::list<CombineInfo> &MergeList = *I;
     bool OptimizeListAgain = false;
     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
-      // We weren't able to make any changes, so clear the list so we don't
+      // We weren't able to make any changes, so delete the list so we don't
       // process the same instructions the next time we try to optimize this
       // block.
-      MergeList.clear();
+      I = MergeableInsts.erase(I);
       continue;
     }
 
-    // We made changes, but also determined that there were no more optimization
-    // opportunities, so we don't need to reprocess the list
-    if (!OptimizeListAgain)
-      MergeList.clear();
-
-    OptimizeAgain |= OptimizeListAgain;
     Modified = true;
-  }
-  return Modified;
-}
-
-void
-SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
-                                         const MachineInstr &MI) {
-  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
-    if (&*CI->I == &MI) {
-      MergeList.erase(CI);
-      return;
+    // We made changes, but also determined that there were no more
+    // optimization opportunities, so we don't need to reprocess the list.
+    if (!OptimizeListAgain) {
+      I = MergeableInsts.erase(I);
+      continue;
     }
+    OptimizeAgain = true;
   }
+  return Modified;
 }
 
 bool
 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
     std::list<CombineInfo> &MergeList,
     bool &OptimizeListAgain) {
+  if (MergeList.empty())
+    return false;
+
   bool Modified = false;
 
-  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
-    CombineInfo &CI = *I;
-    CombineInfo Paired;
-
-    if (CI.InstClass == UNKNOWN)
-      continue;
+  for (auto I = MergeList.begin(), Next = std::next(I);
+       Next != MergeList.end(); Next = std::next(I)) {
+
+    auto First = I;
+    auto Second = Next;
+
+    if ((*First).Order > (*Second).Order)
+      std::swap(First, Second);
+    CombineInfo &CI = *First;
+    CombineInfo &Paired = *Second;
 
-    if (!findMatchingInst(CI, Paired))
-      goto done;
+    SmallVector<MachineInstr *, 8> InstsToMove;
+    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+      ++I;
+      continue;
+    }
 
     Modified = true;
-    removeCombinedInst(MergeList, *Paired.I);
 
     switch (CI.InstClass) {
     default:
       llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
-      MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
-      MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
-      MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
-      MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
-      MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
-      MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
-      MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
-      MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired);
+      MachineBasicBlock::iterator NewMI =
+          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
+    CI.Order = Paired.Order;
+    if (I == Second)
+      I = Next;
 
-done:
-    // Clear the InstsToMove after we have finished searching so we don't have
-    // stale values left over if we search for this CI again in another pass
-    // over the block.
-    CI.InstsToMove.clear();
+    MergeList.erase(Second);
   }
 
   return Modified;
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -6,9 +6,9 @@
 ; offset0 is larger than offset1
 
 ; SI-LABEL: {{^}}offset_order:
-; SI-DAG: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}}
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
 define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
@@ -66,6 +66,7 @@
     attributes #0 = { convergent nounwind }
 
     define amdgpu_kernel void @merge_mmos(i32 addrspace(1)* %ptr_addr1) { ret void }
+    define amdgpu_kernel void @reorder_offsets(i32 addrspace(1)* %reorder_addr1) { ret void }
 ...
 ---
@@ -194,3 +195,26 @@
     S_ENDPGM 0
 
 ...
+---
+# CHECK-LABEL: reorder_offsets
+# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1)
+# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1)
+
+name: reorder_offsets
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -133,17 +133,17 @@
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx9_tbuffer_load_float_32
 body: |
@@ -170,17 +170,17 @@
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx9_tbuffer_load_sint_32
 body: |
@@ -207,17 +207,17 @@
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx9_tbuffer_load_uint_32
 body: |
@@ -492,11 +492,11 @@
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_float32
 body: |
@@ -546,11 +546,11 @@
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_sint32
 body: |
@@ -600,11 +600,11 @@
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx9_tbuffer_store_uint32
 body: |
@@ -911,17 +911,17 @@
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx10_tbuffer_load_float_32
 body: |
@@ -948,17 +948,17 @@
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx10_tbuffer_load_sint_32
 body: |
@@ -985,17 +985,17 @@
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3
+# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
+# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1
 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0
-# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
+# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
 
 name: gfx10_tbuffer_load_uint_32
 body: |
@@ -1271,11 +1271,11 @@
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_float32
 body: |
@@ -1325,11 +1325,11 @@
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_sint32
 body: |
@@ -1379,11 +1379,11 @@
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3
-# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
+# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
-# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2
+# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
 name: gfx10_tbuffer_store_uint32
 body: |
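
The new control flow is easier to see in isolation. Below is a minimal, self-contained C++ sketch (not part of the patch) of the two-phase strategy it introduces: record each candidate's position in the block in Order, sort each same-base-address list by Offset so mergeable accesses become adjacent, then walk adjacent pairs, always treating the instruction that appears first in program order as the anchor (CI). Info, canMerge, and the 16-byte width cap are simplified stand-ins for CombineInfo, checkAndPrepareMerge, and the pass's real width checks; the input mirrors the reorder_offsets MIR test added above.

// sketch.cpp -- illustrative only; simplified model of the pairing loop.
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <list>

struct Info {
  unsigned Order; // position in the original basic block
  int Offset;     // byte offset from the shared base address
  int Width;      // access width in bytes
};

// Stand-in for checkAndPrepareMerge: the two accesses must be adjacent in
// memory, and the combined access must fit in one dwordx4 (16 bytes).
static bool canMerge(const Info &CI, const Info &Paired) {
  if (CI.Width + Paired.Width > 16)
    return false;
  return CI.Offset + CI.Width == Paired.Offset ||
         Paired.Offset + Paired.Width == CI.Offset;
}

int main() {
  // Same layout as the reorder_offsets test: dword stores to base+4, +8,
  // +12, +16, +20, and +0, in that program order.
  std::list<Info> MergeList = {{0, 4, 4},  {1, 8, 4},  {2, 12, 4},
                               {3, 16, 4}, {4, 20, 4}, {5, 0, 4}};

  // Part 2 of collectMergeableInsts: sort by offset so that mergeable
  // accesses end up adjacent in the list.
  MergeList.sort(
      [](const Info &A, const Info &B) { return A.Offset < B.Offset; });

  // Shape of the pairing loop in optimizeInstsWithSameBaseAddr.
  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {
    auto First = I, Second = Next;
    // The instruction that appears earlier in the block is the anchor.
    if (First->Order > Second->Order)
      std::swap(First, Second);
    if (!canMerge(*First, *Second)) {
      ++I;
      continue;
    }
    // Fold Second into First; the merged access keeps the later Order,
    // mirroring `CI.Order = Paired.Order;` in the patch.
    First->Offset = std::min(First->Offset, Second->Offset);
    First->Width += Second->Width;
    First->Order = Second->Order;
    I = First;
    MergeList.erase(Second);
  }

  // Prints: offset 0 width 16, then offset 16 width 8 -- the DWORDX4 and
  // DWORDX2 stores the reorder_offsets test expects.
  for (const Info &MI : MergeList)
    std::printf("offset %d width %d\n", MI.Offset, MI.Width);
  return 0;
}

Keeping an explicit Order field is what lets the pass sort by offset without violating checkAndPrepareMerge's stated precondition that CI precede Paired in the basic block; after a merge, the combined entry takes the later instruction's Order because the merged instruction is emitted at the later of the two positions.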