diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -185,6 +185,9 @@
   AliasAnalysis *AA = nullptr;
   bool OptimizeAgain;
 
+  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
+                           const DenseSet<Register> &ARegUses,
+                           const MachineInstr &A, const MachineInstr &B) const;
   static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
                                   const CombineInfo &Paired);
@@ -199,38 +202,37 @@
                                  const CombineInfo &Paired);
   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
 
-  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
-                            SmallVectorImpl<MachineInstr *> &InstsToMove);
+  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
 
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
-                                             CombineInfo &Paired,
-                                             const SmallVectorImpl<MachineInstr *> &InstsToMove);
+  MachineBasicBlock::iterator
+  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+                 MachineBasicBlock::iterator InsertBefore);
 
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
   MachineBasicBlock::iterator
   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
-                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                  MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
-                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                 MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
-                          const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                          MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
-                      const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                      MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
-                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                       MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
-                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                       MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
-                        const SmallVectorImpl<MachineInstr *> &InstsToMove);
+                        MachineBasicBlock::iterator InsertBefore);
 
   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                            int32_t NewOffset) const;
@@ -580,74 +582,31 @@
   return new SILoadStoreOptimizer();
 }
 
-static void moveInstsAfter(MachineBasicBlock::iterator I,
-                           ArrayRef<MachineInstr *> InstsToMove) {
-  MachineBasicBlock *MBB = I->getParent();
-  ++I;
-  for (MachineInstr *MI : InstsToMove) {
-    MI->removeFromParent();
-    MBB->insert(I, MI);
-  }
-}
-
 static void addDefsUsesToList(const MachineInstr &MI,
                               DenseSet<Register> &RegDefs,
-                              DenseSet<Register> &PhysRegUses) {
-  for (const MachineOperand &Op : MI.operands()) {
-    if (Op.isReg()) {
-      if (Op.isDef())
-        RegDefs.insert(Op.getReg());
-      else if (Op.readsReg() && Op.getReg().isPhysical())
-        PhysRegUses.insert(Op.getReg());
-    }
-  }
-}
-
-static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
-                                      MachineBasicBlock::iterator B,
-                                      AliasAnalysis *AA) {
-  // RAW or WAR - cannot reorder
-  // WAW - cannot reorder
-  // RAR - safe to reorder
-  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
-}
-
-// Add MI and its defs to the lists if MI reads one of the defs that are
-// already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
-                                  DenseSet<Register> &PhysRegUses,
-                                  SmallVectorImpl<MachineInstr *> &Insts) {
-  for (MachineOperand &Use : MI.operands()) {
-    // If one of the defs is read, then there is a use of Def between I and the
-    // instruction that I will potentially be merged with. We will need to move
-    // this instruction after the merged instructions.
-    //
-    // Similarly, if there is a def which is read by an instruction that is to
-    // be moved for merging, then we need to move the def-instruction as well.
-    // This can only happen for physical registers such as M0; virtual
-    // registers are in SSA form.
-    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
-                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
-                        (Use.isDef() && Use.getReg().isPhysical() &&
-                         PhysRegUses.count(Use.getReg())))) {
-      Insts.push_back(&MI);
-      addDefsUsesToList(MI, RegDefs, PhysRegUses);
-      return true;
-    }
+                              DenseSet<Register> &RegUses) {
+  for (const auto &Op : MI.operands()) {
+    if (!Op.isReg())
+      continue;
+    if (Op.isDef())
+      RegDefs.insert(Op.getReg());
+    if (Op.readsReg())
+      RegUses.insert(Op.getReg());
   }
-
-  return false;
 }
 
-static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
-                                    ArrayRef<MachineInstr *> InstsToMove,
-                                    AliasAnalysis *AA) {
-  assert(MemOp.mayLoadOrStore());
-
-  for (MachineInstr *InstToMove : InstsToMove) {
-    if (!InstToMove->mayLoadOrStore())
+bool SILoadStoreOptimizer::canSwapInstructions(
+    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
+    const MachineInstr &A, const MachineInstr &B) const {
+  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
+      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
+    return false;
+  for (const auto &BOp : B.operands()) {
+    if (!BOp.isReg())
       continue;
-    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
+      return false;
+    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
       return false;
   }
   return true;
@@ -890,86 +849,59 @@
   return nullptr;
 }
 
-/// This function assumes that CI comes before Paired in a basic block.
-bool SILoadStoreOptimizer::checkAndPrepareMerge(
-    CombineInfo &CI, CombineInfo &Paired,
-    SmallVectorImpl<MachineInstr *> &InstsToMove) {
+/// This function assumes that CI comes before Paired in a basic block. Return
+/// an insertion point for the merged instruction or nullptr on failure.
+SILoadStoreOptimizer::CombineInfo *
+SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
+                                           CombineInfo &Paired) {
   // If another instruction has already been merged into CI, it may now be a
   // type that we can't do any further merging into.
   if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
-    return false;
+    return nullptr;
   assert(CI.InstClass == Paired.InstClass);
 
   if (getInstSubclass(CI.I->getOpcode(), *TII) !=
       getInstSubclass(Paired.I->getOpcode(), *TII))
-    return false;
+    return nullptr;
 
   // Check both offsets (or masks for MIMG) can be combined and fit in the
   // reduced range.
   if (CI.InstClass == MIMG) {
     if (!dmasksCanBeCombined(CI, *TII, Paired))
-      return false;
+      return nullptr;
   } else {
     if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
-      return false;
+      return nullptr;
   }
 
-  DenseSet<Register> RegDefsToMove;
-  DenseSet<Register> PhysRegUsesToMove;
-  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
-
-  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
-  for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
-    if (MBBI == MBBE) {
-      // CombineInfo::Order is a hint on the instruction ordering within the
-      // basic block. This hint suggests that CI precedes Paired, which is
-      // true most of the time. However, moveInstsAfter() processing a
-      // previous list may have changed this order in a situation when it
-      // moves an instruction which exists in some other merge list.
-      // In this case it must be dependent.
-      return false;
+  DenseSet<Register> RegDefs;
+  DenseSet<Register> RegUses;
+  CombineInfo *Where;
+  if (CI.I->mayLoad()) {
+    // Try to hoist Paired up to CI.
+    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
+    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
+      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
+        return nullptr;
     }
-
-    // Keep going as long as one of these conditions are met:
-    // 1. It is safe to move I down past MBBI.
-    // 2. It is safe to move MBBI down past the instruction that I will
-    //    be merged into.
-
-    if (MBBI->mayLoadOrStore() &&
-        (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
-         !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
-      // We fail condition #1, but we may still be able to satisfy condition
-      // #2. Add this instruction to the move list and then we will check
-      // if condition #2 holds once we have selected the matching instruction.
-      InstsToMove.push_back(&*MBBI);
-      addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
-      continue;
+    Where = &CI;
+  } else {
+    // Try to sink CI down to Paired.
+    addDefsUsesToList(*CI.I, RegDefs, RegUses);
+    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
+      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
+        return nullptr;
     }
-
-    // When we match I with another load/store instruction we will be moving I
-    // down to the location of the matched instruction any uses of I will need
-    // to be moved down as well.
-    addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, InstsToMove);
+    Where = &Paired;
   }
 
-  // If Paired depends on any of the instructions we plan to move, give up.
-  if (addToListsIfDependent(*Paired.I, RegDefsToMove, PhysRegUsesToMove,
-                            InstsToMove))
-    return false;
-
-  // We need to go through the list of instructions that we plan to
-  // move and make sure they are all safe to move down past the merged
-  // instruction.
-  if (!canMoveInstsAcrossMemOp(*Paired.I, InstsToMove, AA))
-    return false;
-
   // Call offsetsCanBeCombined with modify = true so that the offsets are
   // correct for the new instruction. This should return true, because
   // this function should only be called on CombineInfo objects that
   // have already been confirmed to be mergeable.
   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
     offsetsCanBeCombined(CI, *STM, Paired, true);
-  return true;
+  return Where;
 }
 
 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
@@ -988,7 +920,7 @@
 MachineBasicBlock::iterator
 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
-                                     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+                                     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -1027,13 +959,13 @@
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
         .addImm(CI.BaseOff);
 
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
 
-    TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
+    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
         .addReg(ImmReg)
         .addReg(AddrReg->getReg(), 0, BaseSubReg)
         .addImm(0); // clamp bit
@@ -1041,7 +973,7 @@
   }
 
   MachineInstrBuilder Read2 =
-      BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
+      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
@@ -1053,14 +985,12 @@
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 
   // Copy to the old destination registers.
-  BuildMI(*MBB, Paired.I, DL, CopyDesc)
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
       .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
-                            .add(*Dest1)
-                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
-  moveInstsAfter(Copy1, InstsToMove);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1085,9 +1015,9 @@
              : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 }
 
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
-                                      const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+    CombineInfo &CI, CombineInfo &Paired,
+    MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1121,13 +1051,13 @@
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
         .addImm(CI.BaseOff);
 
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
-    TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
+    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
         .addReg(ImmReg)
         .addReg(AddrReg->getReg(), 0, BaseSubReg)
         .addImm(0); // clamp bit
@@ -1135,7 +1065,7 @@
   }
 
   MachineInstrBuilder Write2 =
-      BuildMI(*MBB, Paired.I, DL, Write2Desc)
+      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
@@ -1144,8 +1074,6 @@
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});
 
-  moveInstsAfter(Write2, InstsToMove);
-
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1155,7 +1083,7 @@
 MachineBasicBlock::iterator
 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
-                                     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+                                     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1167,7 +1095,7 @@
   unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                  AMDGPU::OpName::dmask);
 
-  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
   for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
     if (I == DMaskIdx)
       MIB.addImm(MergedDMask);
@@ -1193,14 +1121,12 @@
   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
 
-  BuildMI(*MBB, Paired.I, DL, CopyDesc)
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
       .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
-                            .add(*Dest1)
-                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
-  moveInstsAfter(Copy1, InstsToMove);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1209,7 +1135,7 @@
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
     CombineInfo &CI, CombineInfo &Paired,
-    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+    MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1228,11 +1154,12 @@
   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
 
   MachineInstr *New =
-      BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
-          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
-          .addImm(MergedOffset) // offset
-          .addImm(CI.CPol)      // cpol
-          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+          .addImm(MergedOffset) // offset
+          .addImm(CI.CPol)      // cpol
+          .addMemOperand(
+              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1243,14 +1170,12 @@
   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
 
-  BuildMI(*MBB, Paired.I, DL, CopyDesc)
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
-                            .add(*Dest1)
-                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
-  moveInstsAfter(Copy1, InstsToMove);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1259,7 +1184,7 @@
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
     CombineInfo &CI, CombineInfo &Paired,
-    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+    MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1271,7 +1196,7 @@
   Register DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
 
-  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
 
   AddressRegs Regs = getRegs(Opcode, *TII);
 
@@ -1304,14 +1229,12 @@
   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
 
-  BuildMI(*MBB, Paired.I, DL, CopyDesc)
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
       .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
-                            .add(*Dest1)
-                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
-  moveInstsAfter(Copy1, InstsToMove);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1320,7 +1243,7 @@
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
     CombineInfo &CI, CombineInfo &Paired,
-    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+    MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1332,7 +1255,7 @@
   Register DestReg = MRI->createVirtualRegister(SuperRC);
   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
 
-  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
+  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
 
   AddressRegs Regs = getRegs(Opcode, *TII);
 
@@ -1370,14 +1293,12 @@
   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
 
-  BuildMI(*MBB, Paired.I, DL, CopyDesc)
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
-                            .add(*Dest1)
-                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
-
-  moveInstsAfter(Copy1, InstsToMove);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1386,7 +1307,7 @@
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
     CombineInfo &CI, CombineInfo &Paired,
-    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+    MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1403,13 +1324,13 @@
   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
 
-  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);
 
-  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
+  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
 
   AddressRegs Regs = getRegs(Opcode, *TII);
@@ -1439,8 +1360,6 @@
           .addMemOperand(
               combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
-  moveInstsAfter(MIB, InstsToMove);
-
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
@@ -1545,7 +1464,7 @@
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     CombineInfo &CI, CombineInfo &Paired,
-    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
+    MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1562,13 +1481,13 @@
   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
 
-  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);
 
-  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
+  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
 
   AddressRegs Regs = getRegs(Opcode, *TII);
 
@@ -1594,8 +1513,6 @@
          .addImm(0) // swz
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
-  moveInstsAfter(MIB, InstsToMove);
-
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
@@ -2074,8 +1991,8 @@
     CombineInfo &CI = *First;
     CombineInfo &Paired = *Second;
 
-    SmallVector<MachineInstr *, 8> InstsToMove;
-    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
+    if (!Where) {
       ++I;
       continue;
     }
@@ -2090,38 +2007,38 @@
       llvm_unreachable("unknown InstClass");
       break;
    case DS_READ:
-      NewMI = mergeRead2Pair(CI, Paired, InstsToMove);
+      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
-      NewMI = mergeWrite2Pair(CI, Paired, InstsToMove);
+      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
-      NewMI = mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
+      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
-      NewMI = mergeBufferLoadPair(CI, Paired, InstsToMove);
+      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
-      NewMI = mergeBufferStorePair(CI, Paired, InstsToMove);
+      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
-      NewMI = mergeImagePair(CI, Paired, InstsToMove);
+      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
-      NewMI = mergeTBufferLoadPair(CI, Paired, InstsToMove);
+      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
-      NewMI = mergeTBufferStorePair(CI, Paired, InstsToMove);
+      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
-    CI.Order = Paired.Order;
+    CI.Order = Where->Order;
 
    if (I == Second)
      I = Next;
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -66,13 +66,15 @@
 }
 
 
-; The second load depends on the store. We can combine the two loads, and the combined load is
-; at the original place of the second load.
+; The second load depends on the store. We could combine the two loads, putting
+; the combined load at the original place of the second load, but we prefer to
+; leave the first load near the start of the function to hide its latency.
 ; GCN-LABEL: {{^}}ds_combine_RAW
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
 
-; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
 define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
   %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1244,28 +1244,28 @@
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v1
-; CI-NEXT:    v_add_i32_e32 v3, vcc, s5, v4
-; CI-NEXT:    v_add_i32_e32 v5, vcc, s6, v1
+; CI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
+; CI-NEXT:    v_add_i32_e32 v4, vcc, s6, v1
+; CI-NEXT:    v_add_i32_e32 v6, vcc, s7, v0
 ; CI-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 ; CI-NEXT:    ds_read2_b32 v[2:3], v3 offset1:4
-; CI-NEXT:    v_add_i32_e32 v6, vcc, s7, v4
-; CI-NEXT:    ds_read2_b32 v[4:5], v5 offset1:1
+; CI-NEXT:    ds_read2_b32 v[4:5], v4 offset1:1
 ; CI-NEXT:    ds_read2_b32 v[6:7], v6 offset1:4
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; CI-NEXT:    v_add_f32_e32 v0, 2.0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mul_f32_e32 v2, v4, v6
 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v2
+; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; CI-NEXT:    v_mul_f32_e32 v1, v5, v7
-; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
 ; CI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -6,9 +6,15 @@
 @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 @c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 
+; FIXME: Should combine the DS instructions into ds_write2 and ds_read2. This
+; does not happen because when SILoadStoreOptimizer is run, the reads and writes
+; are not adjacent. They are only moved later by MachineScheduler.
+
 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
-; GCN: ds_write2st64_b32
-; GCN: ds_read2st64_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: ds_read_b32
+; GCN: ds_read_b32
 
 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2
 ; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
@@ -30,9 +36,11 @@
 }
 
 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
-; GCN-DAG: ds_write2st64_b32
 ; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_read2st64_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_read_b32
 ; GCN-DAG: ds_read_b32
 
 ; CHECK-LABEL: @no_clobber_ds_load_stores_x3
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir
@@ -7,9 +7,9 @@
 # However, an equivalent situation can occur with buffer instructions as well.
 
 # CHECK-LABEL: name: scc_def_and_use_no_dependency
+# CHECK: DS_READ2_B32
 # CHECK: S_ADD_U32
 # CHECK: S_ADDC_U32
-# CHECK: DS_READ2_B32
 ---
 name: scc_def_and_use_no_dependency
 machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir
@@ -2,9 +2,8 @@
 
 # GCN-LABEL: name: out_of_order_merge
 # GCN: DS_READ2_B64_gfx9
-# GCN: DS_WRITE_B64_gfx9
 # GCN: DS_READ2_B64_gfx9
-# GCN: DS_WRITE_B64_gfx9
+# GCN: DS_WRITE2_B64_gfx9
 # GCN: DS_WRITE_B64_gfx9
 ---
 name: out_of_order_merge
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -780,8 +780,8 @@
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+# GFX9-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+# GFX9-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
 name: gfx9_tbuffer_load_merge_across_swizzle
 body:             |
   bb.0.entry:
@@ -1597,8 +1597,8 @@
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+# GFX10-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+# GFX10-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
 name: gfx10_tbuffer_load_merge_across_swizzle
 body:             |
   bb.0.entry:
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -9,9 +9,9 @@
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
 
-; GFX9: global_store_dword
 ; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; GFX9: global_store_dword
+; GFX9: global_store_dword
 define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4