diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -186,6 +186,10 @@ return true; } + bool pairedHasLowerOffset() { + return Offset1 < Offset0; + } + void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, const GCNSubtarget &STM); void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII); @@ -214,7 +218,7 @@ AliasAnalysis *AA = nullptr; bool OptimizeAgain; - static bool offsetsCanBeCombined(CombineInfo &CI); + static bool offsetsCanBeCombined(CombineInfo &CI, bool Modify = false); static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); static unsigned getNewOpcode(const CombineInfo &CI); static std::pair getSubRegIdxs(const CombineInfo &CI); @@ -224,14 +228,19 @@ unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); + MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, + std::list &MergeList); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); - MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); - MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); - MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); + MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, + std::list &MergeList); + MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, + std::list &MergeList); + MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, + std::list &MergeList); + MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, + std::list &MergeList); void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, int32_t NewOffset) const; @@ -249,6 +258,10 @@ std::list > &MergeableInsts) const; bool collectMergeableInsts(MachineBasicBlock &MBB, std::list > &MergeableInsts) const; + bool isInOrder(MachineBasicBlock::const_iterator First, + MachineBasicBlock::const_iterator Second) const; + void order(CombineInfo &CI) const; + bool isValid(CombineInfo &CI) const; public: static char ID; @@ -257,8 +270,9 @@ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - void removeCombinedInst(std::list &MergeList, - const MachineInstr &MI); + void replaceInstsInList(std::list &MergeList, + const CombineInfo &CI, + MachineBasicBlock::iterator NewMI); bool optimizeInstsWithSameBaseAddr(std::list &MergeList, bool &OptimizeListAgain); bool optimizeBlock(std::list > &MergeableInsts); @@ -464,7 +478,14 @@ void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII) { Paired = MI; - assert(InstClass == getInstClass(Paired->getOpcode(), TII)); + + // We can't pair these if we can't determine the instruction + // class. + if (InstClass == UNKNOWN || getInstClass(MI->getOpcode(), TII) != InstClass) { + InstClass = UNKNOWN; + return; + } + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); Offset1 = Paired->getOperand(OffsetIdx).getImm(); @@ -586,7 +607,7 @@ return MMO; } -bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { +bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, bool Modify) { // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? if (CI.Offset0 == CI.Offset1) @@ -613,16 +634,20 @@ // the stride 64 versions. if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { - CI.Offset0 = EltOffset0 / 64; - CI.Offset1 = EltOffset1 / 64; - CI.UseST64 = true; + if (Modify) { + CI.Offset0 = EltOffset0 / 64; + CI.Offset1 = EltOffset1 / 64; + CI.UseST64 = true; + } return true; } // Check if the new offsets fit in the reduced 8-bit range. if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { - CI.Offset0 = EltOffset0; - CI.Offset1 = EltOffset1; + if (Modify) { + CI.Offset0 = EltOffset0; + CI.Offset1 = EltOffset1; + } return true; } @@ -631,15 +656,19 @@ CI.BaseOff = std::min(CI.Offset0, CI.Offset1); if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { - CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; - CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; - CI.UseST64 = true; + if (Modify) { + CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; + CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + CI.UseST64 = true; + } return true; } if (isUInt<8>(OffsetDiff)) { - CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize; - CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize; + if (Modify) { + CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize; + CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize; + } return true; } @@ -664,8 +693,7 @@ } bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { - MachineBasicBlock *MBB = CI.I->getParent(); - MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator E = std::next(CI.Paired); MachineBasicBlock::iterator MBBI = CI.I; const unsigned Opc = CI.I->getOpcode(); @@ -731,18 +759,18 @@ CI.InstsToMove)) continue; - bool Match = CI.hasSameBaseAddress(*MBBI); - - if (Match) { - CI.setPaired(MBBI, *TII); - - // Check both offsets fit in the reduced range. - // We also need to go through the list of instructions that we plan to + if (&*MBBI == &*CI.Paired) { + // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) - if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) - return true; + if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) { + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. + offsetsCanBeCombined(CI, true); + return true; + } } // We've found a load/store that we couldn't merge for some reason. @@ -772,7 +800,8 @@ } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, + std::list &MergeList) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -847,6 +876,8 @@ moveInstsAfter(Copy1, CI.InstsToMove); + replaceInstsInList(MergeList, CI, Read2); + CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); @@ -871,7 +902,8 @@ } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, + std::list &MergeList) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be @@ -930,6 +962,8 @@ moveInstsAfter(Write2, CI.InstsToMove); + replaceInstsInList(MergeList, CI, Write2); + CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); @@ -938,7 +972,8 @@ } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, + std::list &MergeList) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI); @@ -982,13 +1017,16 @@ moveInstsAfter(Copy1, CI.InstsToMove); + replaceInstsInList(MergeList, CI, New); + CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); return New; } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, + std::list &MergeList) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1043,6 +1081,8 @@ moveInstsAfter(Copy1, CI.InstsToMove); + replaceInstsInList(MergeList, CI, New); + CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); return New; @@ -1167,7 +1207,8 @@ } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, + std::list &MergeList) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1219,6 +1260,8 @@ moveInstsAfter(MIB, CI.InstsToMove); + replaceInstsInList(MergeList, CI, New); + CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); return New; @@ -1547,6 +1590,31 @@ MergeableInsts.emplace_back(1, CI); } +bool SILoadStoreOptimizer::isInOrder(MachineBasicBlock::const_iterator First, + MachineBasicBlock::const_iterator Second) const { + // FIXME: Is there a better way to do this. + const MachineBasicBlock *MBB = First->getParent(); + for (auto I = First, E = MBB->end(); I != E; ++I) { + if (I == Second) + return true; + } + return false; + +} + +void SILoadStoreOptimizer::order(CombineInfo &CI) const { + if (!isInOrder(CI.I, CI.Paired)) { + auto OldMI = CI.I; + CI.setMI(CI.Paired, *TII, *STM); + CI.setPaired(OldMI, *TII); + } +} + +bool SILoadStoreOptimizer::isValid(CombineInfo &CI) const { + return CI.InstClass != UNKNOWN && widthsFit(*STM, CI) && + offsetsCanBeCombined(CI); +} + bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, std::list > &MergeableInsts) const { bool Modified = false; @@ -1579,6 +1647,62 @@ addInstToMergeableList(CI, MergeableInsts); } + + // At this point we have lists of Mergeable instructions. In each list + // there is a CombineInfo object which has it's I member set, but nothing + // set of Paired yet. + // + // + // Part 2: Sort lists by offset and then for each CombineInfo object in the + // list try to find an instruction that can be merged with I. If an instruction + // is found, it is stored in the Paired field. If no instructions are found, then + // the CombineInfo object is deleted from the list. + for (std::list &MergeList : MergeableInsts) { + + if (MergeList.size() <= 1) { + // This means we have found only one instruction with a given address + // that can be merged, and we need at least 2 instructions to do a merge, + // so this list can be discarded. + // The code below assumes lists have at least 2 items. + MergeList.clear(); + continue; + } + + // Sort the lists by offsets, this way mergeable instructions will be + // adjacent to each other in the list, which will make it easier to find + // matches. + MergeList.sort( + [] (const CombineInfo &A, CombineInfo &B) { + return A.Offset0 < B.Offset0; + }); + + for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); + Next = std::next(I)) { + CombineInfo &CI = *I; + + // Check that the next instruction in the list can be paired with this + // one. Since the list is sorted by offset, if this instruction can't + // be paired with the next one in the list, then it can't be paired at + // all. + CI.setPaired(Next->I, *TII); + if (!widthsFit(*STM, CI) || !offsetsCanBeCombined(CI)) { + I = MergeList.erase(I); + continue; + } + + // Offsets can be combined. If the Paired instruction comes first in the + // order() will swap I and Paired, because the optimizeBlock() function + // only looks forward for matches. + order(CI); + + I = Next; + } + + // The last instruction in each list can be deleted, since it will have + // an empty paired value. + MergeList.pop_back(); + } + return Modified; } @@ -1590,7 +1714,7 @@ bool Modified = false; for (std::list &MergeList : MergeableInsts) { - if (MergeList.size() < 2) + if (MergeList.empty()) continue; bool OptimizeListAgain = false; @@ -1613,14 +1737,26 @@ return Modified; } +// FIXME: This doesn't have to be a loop. void -SILoadStoreOptimizer::removeCombinedInst(std::list &MergeList, - const MachineInstr &MI) { +SILoadStoreOptimizer::replaceInstsInList(std::list &MergeList, + const CombineInfo &CI, + MachineBasicBlock::iterator NewMI) { + for (auto I = MergeList.begin(), E = MergeList.end(); I != E; ++I) { + // Don't need to update CI, because this will be deleted. + if (&*I == &CI) + continue; - for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { - if (&*CI->I == &MI) { - MergeList.erase(CI); - return; + if (&*I->I == &*CI.I || &*I->I == &*CI.Paired) { + I->setMI(NewMI, *TII, *STM); + order(*I); + } + + if (&*I->Paired == &*CI.I || &*I->Paired == &*CI.Paired) { + I->setPaired(NewMI, *TII); + // Reset paired to re-calculate the + // FIXME: Do we need to reset Paired here? + order(*I); } } } @@ -1630,35 +1766,40 @@ std::list &MergeList, bool &OptimizeListAgain) { bool Modified = false; - for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + for (auto I = MergeList.begin(); I != MergeList.end();) { CombineInfo &CI = *I; + if (!isValid(CI)) { + ++I; + continue; + } + switch (CI.InstClass) { default: break; case DS_READ: if (findMatchingInst(CI)) { Modified = true; - removeCombinedInst(MergeList, *CI.Paired); - MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); - CI.setMI(NewMI, *TII, *STM); + mergeRead2Pair(CI, MergeList); + I = MergeList.erase(I); + continue; } break; case DS_WRITE: if (findMatchingInst(CI)) { Modified = true; - removeCombinedInst(MergeList, *CI.Paired); - MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); - CI.setMI(NewMI, *TII, *STM); + mergeWrite2Pair(CI, MergeList); + I = MergeList.erase(I); + continue; } break; case S_BUFFER_LOAD_IMM: if (findMatchingInst(CI)) { Modified = true; - removeCombinedInst(MergeList, *CI.Paired); - MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); - CI.setMI(NewMI, *TII, *STM); + mergeSBufferLoadImmPair(CI, MergeList); OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; + I = MergeList.erase(I); + continue; } break; case BUFFER_LOAD_OFFEN: @@ -1667,10 +1808,10 @@ case BUFFER_LOAD_OFFSET_exact: if (findMatchingInst(CI)) { Modified = true; - removeCombinedInst(MergeList, *CI.Paired); - MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); - CI.setMI(NewMI, *TII, *STM); + mergeBufferLoadPair(CI, MergeList); OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; + I = MergeList.erase(I); + continue; } break; case BUFFER_STORE_OFFEN: @@ -1679,17 +1820,14 @@ case BUFFER_STORE_OFFSET_exact: if (findMatchingInst(CI)) { Modified = true; - removeCombinedInst(MergeList, *CI.Paired); - MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); - CI.setMI(NewMI, *TII, *STM); + mergeBufferStorePair(CI, MergeList); OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; + I = MergeList.erase(I); + continue; } break; } - // Clear the InstsToMove after we have finished searching so we don't have - // stale values left over if we search for this CI again in another pass - // over the block. - CI.InstsToMove.clear(); + ++I; } return Modified; diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -6,9 +6,9 @@ ; offset0 is larger than offset1 ; SI-LABEL: {{^}}offset_order: -; SI-DAG: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}} +; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}} ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 +; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12 define amdgpu_kernel void @offset_order(float addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir --- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir @@ -66,6 +66,7 @@ attributes #0 = { convergent nounwind } define amdgpu_kernel void @merge_mmos(i32 addrspace(1)* %ptr_addr1) { ret void } + define amdgpu_kernel void @reorder_offsets(i32 addrspace(1)* %reorder_addr1) { ret void } ... --- @@ -194,3 +195,26 @@ S_ENDPGM 0 ... +--- +# CHECK-LABEL: reorder_offsets +# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1) +# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1) + +name: reorder_offsets +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1) + S_ENDPGM 0 + + +...