diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -135,6 +135,31 @@
     bool DLC1;
     bool UseST64;
     SmallVector<MachineInstr *, 8> InstsToMove;
+    int AddrIdx[5];
+    const MachineOperand *AddrReg[5];
+    unsigned NumAddresses;
+
+    bool hasSameBaseAddress(const MachineInstr &MI) {
+      for (unsigned i = 0; i < NumAddresses; i++) {
+        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
+
+        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
+          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
+              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
+            return false;
+          }
+          continue;
+        }
+
+        // Check same base pointer. Be careful of subregisters, which can occur
+        // with vectors of pointers.
+        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+          return false;
+        }
+      }
+      return true;
+    }
   };
 
   struct BaseRegisters {
@@ -171,6 +196,9 @@
 
   bool findMatchingInst(CombineInfo &CI);
 
+  void setMI(CombineInfo &CI, MachineBasicBlock::iterator I) const;
+  void setPaired(CombineInfo &CI, MachineBasicBlock::iterator I) const;
+
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
@@ -502,54 +530,117 @@
   }
 }
 
-bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
-  MachineBasicBlock *MBB = CI.I->getParent();
-  MachineBasicBlock::iterator E = MBB->end();
-  MachineBasicBlock::iterator MBBI = CI.I;
+void SILoadStoreOptimizer::setMI(CombineInfo &CI,
+                                 MachineBasicBlock::iterator I) const {
+  CI.I = I;
+  unsigned Opc = I->getOpcode();
+  CI.InstClass = getInstClass(Opc);
 
-  const unsigned Opc = CI.I->getOpcode();
-  const InstClassEnum InstClass = getInstClass(Opc);
+  if (CI.InstClass == UNKNOWN)
+    return;
 
-  if (InstClass == UNKNOWN) {
-    return false;
+  switch (CI.InstClass) {
+  case DS_READ:
+    CI.EltSize =
+        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+                                                                        : 4;
+    break;
+  case DS_WRITE:
+    CI.EltSize =
+        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+                                                                          : 4;
+    break;
+  case S_BUFFER_LOAD_IMM:
+    CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+    break;
+  default:
+    CI.EltSize = 4;
+    break;
   }
 
-  const unsigned Regs = getRegs(Opc);
+  int OffsetIdx =
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
+  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+  CI.Width0 = getOpcodeWidth(*CI.I);
+
+  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
+    CI.Offset0 &= 0xffff;
+  } else {
+    CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+    if (CI.InstClass != S_BUFFER_LOAD_IMM) {
+      CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
+    }
+    CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
+  }
 
   unsigned AddrOpName[5] = {0};
-  int AddrIdx[5];
-  const MachineOperand *AddrReg[5];
-  unsigned NumAddresses = 0;
+  CI.NumAddresses = 0;
+  const unsigned Regs = getRegs(CI.I->getOpcode());
 
   if (Regs & ADDR) {
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+    AddrOpName[CI.NumAddresses++] = AMDGPU::OpName::addr;
   }
 
   if (Regs & SBASE) {
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+    AddrOpName[CI.NumAddresses++] = AMDGPU::OpName::sbase;
   }
 
   if (Regs & SRSRC) {
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+    AddrOpName[CI.NumAddresses++] = AMDGPU::OpName::srsrc;
   }
 
   if (Regs & SOFFSET) {
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+    AddrOpName[CI.NumAddresses++] = AMDGPU::OpName::soffset;
   }
 
   if (Regs & VADDR) {
-    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+    AddrOpName[CI.NumAddresses++] = AMDGPU::OpName::vaddr;
+  }
+
+  for (unsigned i = 0; i < CI.NumAddresses; i++) {
+    CI.AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
+    CI.AddrReg[i] = &CI.I->getOperand(CI.AddrIdx[i]);
+  }
+}
+
+void SILoadStoreOptimizer::setPaired(CombineInfo &CI,
+                                     MachineBasicBlock::iterator I) const {
+  CI.Paired = I;
+  assert(CI.InstClass == getInstClass(I->getOpcode()));
+  int OffsetIdx =
+      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
+  CI.Offset1 = CI.Paired->getOperand(OffsetIdx).getImm();
+  CI.Width1 = getOpcodeWidth(*CI.Paired);
+  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
+    CI.Offset1 &= 0xffff;
+  } else {
+    CI.GLC1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::glc)->getImm();
+    if (CI.InstClass != S_BUFFER_LOAD_IMM) {
+      CI.SLC1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::slc)->getImm();
+    }
+    CI.DLC1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::dlc)->getImm();
   }
 
-  for (unsigned i = 0; i < NumAddresses; i++) {
-    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
-    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
+}
+
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator MBBI = CI.I;
+
+  const unsigned Opc = CI.I->getOpcode();
+  const InstClassEnum InstClass = getInstClass(Opc);
+
+  if (InstClass == UNKNOWN) {
+    return false;
+  }
 
+  for (unsigned i = 0; i < CI.NumAddresses; i++) {
     // We only ever merge operations with the same base address register, so
     // don't bother scanning forward if there are no other uses.
-    if (AddrReg[i]->isReg() &&
-        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
-         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
+    if (CI.AddrReg[i]->isReg() &&
+        (TargetRegisterInfo::isPhysicalRegister(CI.AddrReg[i]->getReg()) ||
+         MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg())))
       return false;
   }
 
@@ -609,50 +700,10 @@
                                 CI.InstsToMove))
         continue;
 
-    bool Match = true;
-    for (unsigned i = 0; i < NumAddresses; i++) {
-      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
-
-      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
-        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
-            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
-          Match = false;
-          break;
-        }
-        continue;
-      }
-
-      // Check same base pointer. Be careful of subregisters, which can occur
-      // with vectors of pointers.
-      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
-          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
-        Match = false;
-        break;
-      }
-    }
+    bool Match = CI.hasSameBaseAddress(*MBBI);
 
     if (Match) {
-      int OffsetIdx =
-          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
-      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
-      CI.Width0 = getOpcodeWidth(*CI.I);
-      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
-      CI.Width1 = getOpcodeWidth(*MBBI);
-      CI.Paired = MBBI;
-
-      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
-        CI.Offset0 &= 0xffff;
-        CI.Offset1 &= 0xffff;
-      } else {
-        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
-        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
-        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
-          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
-          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
-        }
-        CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
-        CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
-      }
+      setPaired(CI, MBBI);
 
       // Check both offsets fit in the reduced range.
       // We also need to go through the list of instructions that we plan to
@@ -1477,19 +1528,13 @@
       continue;
     }
 
-    const unsigned Opc = MI.getOpcode();
-
     CombineInfo CI;
-    CI.I = I;
-    CI.InstClass = getInstClass(Opc);
+    setMI(CI, I);
 
     switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
-      CI.EltSize =
-          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
-                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
@@ -1498,9 +1543,6 @@
      }
      continue;
    case DS_WRITE:
-      CI.EltSize =
-          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
-                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
@@ -1509,7 +1551,6 @@
      }
      continue;
    case S_BUFFER_LOAD_IMM:
-      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
@@ -1522,7 +1563,6 @@
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
-      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
@@ -1535,7 +1575,6 @@
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
-      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);