Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -161,6 +161,31 @@ return true; } + bool hasMergeableAddress(const MachineRegisterInfo &MRI) { + for (unsigned i = 0; i < NumAddresses; ++i) { + const MachineOperand *AddrOp = AddrReg[i]; + // Immediates are always OK. + if (AddrOp->isImm()) + continue; + + // Don't try to merge addresses that aren't either immediates or registers. + // TODO: Should be possible to merge FrameIndexes and maybe some other + // non-register + if (!AddrOp->isReg()) + return false; + + // TODO: We should be able to merge physical reg addreses. + if (Register::isPhysicalRegister(AddrOp->getReg())) + return false; + + // If an address has only one use then there will be on other + // instructions with the same address, so we can't merge this one. + if (MRI.hasOneNonDBGUse(AddrOp->getReg())) + return false; + } + return true; + } + void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, const GCNSubtarget &STM); void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII); @@ -220,6 +245,10 @@ bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited, SmallPtrSet &Promoted) const; + void addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const; + bool collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const; public: static char ID; @@ -228,7 +257,11 @@ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - bool optimizeBlock(MachineBasicBlock &MBB); + void removeCombinedInst(std::list &MergeList, + const MachineInstr &MI); + bool optimizeInstsWithSameBaseAddr(std::list &MergeList, + bool &OptimizeListAgain); + bool optimizeBlock(std::list > &MergeableInsts); bool runOnMachineFunction(MachineFunction &MF) override; @@ -424,6 +457,8 @@ AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); AddrReg[i] = &I->getOperand(AddrIdx[i]); } + + InstsToMove.clear(); } void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, @@ -646,15 +681,6 @@ if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) return false; - for (unsigned i = 0; i < CI.NumAddresses; i++) { - // We only ever merge operations with the same base address register, so - // don't bother scanning forward if there are no other uses. - if (CI.AddrReg[i]->isReg() && - (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg()))) - return false; - } - ++MBBI; DenseSet RegDefsToMove; @@ -827,12 +853,11 @@ moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -911,12 +936,11 @@ moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; } MachineBasicBlock::iterator @@ -938,12 +962,13 @@ const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -963,10 +988,9 @@ moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -997,15 +1021,16 @@ const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1025,10 +1050,9 @@ moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { @@ -1191,22 +1215,22 @@ const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineOperand @@ -1519,32 +1543,105 @@ return false; } -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; +void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const { + for (std::list &AddrList : MergeableInsts) { + if (AddrList.front().hasSameBaseAddress(*CI.I) && + AddrList.front().InstClass == CI.InstClass) { + AddrList.emplace_back(CI); + return; + } + } + + // Base address not found, so add a new list. + MergeableInsts.emplace_back(1, CI); +} +bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const { + bool Modified = false; // Contain the list MemInfoMap Visited; // Contains the list of instructions for which constant offsets are being // promoted to the IMM. SmallPtrSet AnchorList; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - + // Sort potential mergeable instructions into lists. One list per base address. + for (MachineInstr &MI : MBB.instrs()) { + // We run this before checking if an address is mergeable, because it can produce + // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); + if (InstClass == UNKNOWN) + continue; + // Don't combine if volatile. - if (MI.hasOrderedMemoryRef()) { - ++I; + if (MI.hasOrderedMemoryRef()) continue; - } CombineInfo CI; - CI.setMI(I, *TII, *STM); + CI.setMI(MI, *TII, *STM); + + if (!CI.hasMergeableAddress(*MRI)) + continue; + + addInstToMergeableList(CI, MergeableInsts); + } + return Modified; +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock( + std::list > &MergeableInsts) { + bool Modified = false; + + for (std::list &MergeList : MergeableInsts) { + if (MergeList.size() < 2) + continue; + + bool OptimizeListAgain = false; + if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { + // We weren't able to make any changes, so clear the list so we don't + // process the same instructions the next time we try to optimize this + // block. + MergeList.clear(); + continue; + } + + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) + MergeList.clear(); + + OptimizeAgain |= OptimizeListAgain; + Modified = true; + } + return Modified; +} + +void +SILoadStoreOptimizer::removeCombinedInst(std::list &MergeList, + const MachineInstr &MI) { + + for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { + if (&*CI->I == &MI) { + MergeList.erase(CI); + return; + } + } +} + +bool +SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( + std::list &MergeList, + bool &OptimizeListAgain) { + bool Modified = false; + for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + CombineInfo &CI = *I; switch (CI.InstClass) { default: @@ -1552,55 +1649,57 @@ case DS_READ: if (findMatchingInst(CI)) { Modified = true; - I = mergeRead2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case DS_WRITE: if (findMatchingInst(CI)) { Modified = true; - I = mergeWrite2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case S_BUFFER_LOAD_IMM: if (findMatchingInst(CI)) { Modified = true; - I = mergeSBufferLoadImmPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; } - continue; + break; case BUFFER_LOAD_OFFEN: case BUFFER_LOAD_OFFSET: case BUFFER_LOAD_OFFEN_exact: case BUFFER_LOAD_OFFSET_exact: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; case BUFFER_STORE_OFFEN: case BUFFER_STORE_OFFSET: case BUFFER_STORE_OFFEN_exact: case BUFFER_STORE_OFFSET_exact: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferStorePair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; } - - ++I; + // Clear the InstsToMove after we have finished searching so we don't have + // stale values left over if we search for this CI again in another pass + // over the block. + CI.InstsToMove.clear(); } return Modified; @@ -1626,10 +1725,14 @@ bool Modified = false; + for (MachineBasicBlock &MBB : MF) { + std::list > MergeableInsts; + // First pass: Collect list of all instructions we know how to merge. + Modified |= collectMergeableInsts(MBB, MergeableInsts); do { OptimizeAgain = false; - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MergeableInsts); } while (OptimizeAgain); }