llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Lines | |||||
namespace { | namespace { | ||||
enum InstClassEnum { | enum InstClassEnum { | ||||
UNKNOWN, | UNKNOWN, | ||||
DS_READ, | DS_READ, | ||||
DS_WRITE, | DS_WRITE, | ||||
S_BUFFER_LOAD_IMM, | S_BUFFER_LOAD_IMM, | ||||
BUFFER_LOAD, | BUFFER_LOAD, | ||||
BUFFER_STORE, | BUFFER_STORE, | ||||
MIMG, | |||||
}; | }; | ||||
enum RegisterEnum { | enum RegisterEnum { | ||||
SBASE = 0x1, | SBASE = 0x1, | ||||
SRSRC = 0x2, | SRSRC = 0x2, | ||||
SOFFSET = 0x4, | SOFFSET = 0x4, | ||||
VADDR = 0x8, | VADDR = 0x8, | ||||
ADDR = 0x10, | ADDR = 0x10, | ||||
SSAMP = 0x20, | |||||
}; | }; | ||||
class SILoadStoreOptimizer : public MachineFunctionPass { | class SILoadStoreOptimizer : public MachineFunctionPass { | ||||
struct CombineInfo { | struct CombineInfo { | ||||
MachineBasicBlock::iterator I; | MachineBasicBlock::iterator I; | ||||
MachineBasicBlock::iterator Paired; | MachineBasicBlock::iterator Paired; | ||||
unsigned EltSize; | unsigned EltSize; | ||||
unsigned Offset0; | unsigned Offset0; | ||||
unsigned Offset1; | unsigned Offset1; | ||||
unsigned Width0; | unsigned Width0; | ||||
unsigned Width1; | unsigned Width1; | ||||
unsigned BaseOff; | unsigned BaseOff; | ||||
unsigned DMask0; | |||||
unsigned DMask1; | |||||
InstClassEnum InstClass; | InstClassEnum InstClass; | ||||
bool GLC0; | bool GLC0; | ||||
bool GLC1; | bool GLC1; | ||||
bool SLC0; | bool SLC0; | ||||
bool SLC1; | bool SLC1; | ||||
bool DLC0; | bool DLC0; | ||||
bool DLC1; | bool DLC1; | ||||
bool UseST64; | bool UseST64; | ||||
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | |||||
private: | private: | ||||
const GCNSubtarget *STM = nullptr; | const GCNSubtarget *STM = nullptr; | ||||
const SIInstrInfo *TII = nullptr; | const SIInstrInfo *TII = nullptr; | ||||
const SIRegisterInfo *TRI = nullptr; | const SIRegisterInfo *TRI = nullptr; | ||||
MachineRegisterInfo *MRI = nullptr; | MachineRegisterInfo *MRI = nullptr; | ||||
AliasAnalysis *AA = nullptr; | AliasAnalysis *AA = nullptr; | ||||
bool OptimizeAgain; | bool OptimizeAgain; | ||||
static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII); | |||||
static bool offsetsCanBeCombined(CombineInfo &CI); | static bool offsetsCanBeCombined(CombineInfo &CI); | ||||
static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); | static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); | ||||
static unsigned getNewOpcode(const CombineInfo &CI); | static unsigned getNewOpcode(const CombineInfo &CI); | ||||
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); | static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); | ||||
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); | const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); | ||||
bool findMatchingInst(CombineInfo &CI); | bool findMatchingInst(CombineInfo &CI); | ||||
unsigned read2Opcode(unsigned EltSize) const; | unsigned read2Opcode(unsigned EltSize) const; | ||||
unsigned read2ST64Opcode(unsigned EltSize) const; | unsigned read2ST64Opcode(unsigned EltSize) const; | ||||
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); | MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); | ||||
unsigned write2Opcode(unsigned EltSize) const; | unsigned write2Opcode(unsigned EltSize) const; | ||||
unsigned write2ST64Opcode(unsigned EltSize) const; | unsigned write2ST64Opcode(unsigned EltSize) const; | ||||
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); | MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); | ||||
MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI); | |||||
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); | MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); | ||||
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); | MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); | ||||
MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); | MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); | ||||
void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, | void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, | ||||
int32_t NewOffset) const; | int32_t NewOffset) const; | ||||
unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; | unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; | ||||
MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; | MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; | ||||
Show All 37 Lines | |||||
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { | static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { | ||||
const unsigned Opc = MI.getOpcode(); | const unsigned Opc = MI.getOpcode(); | ||||
if (TII.isMUBUF(Opc)) { | if (TII.isMUBUF(Opc)) { | ||||
// FIXME: Handle d16 correctly | // FIXME: Handle d16 correctly | ||||
return AMDGPU::getMUBUFElements(Opc); | return AMDGPU::getMUBUFElements(Opc); | ||||
} | } | ||||
if (TII.isMIMG(MI)) { | |||||
uint64_t DMaskImm = | |||||
TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); | |||||
return countPopulation(DMaskImm); | |||||
} | |||||
switch (Opc) { | switch (Opc) { | ||||
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | ||||
return 1; | return 1; | ||||
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | ||||
return 2; | return 2; | ||||
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | ||||
return 4; | return 4; | ||||
Show All 17 Lines | if (TII.isMUBUF(Opc)) { | ||||
return BUFFER_LOAD; | return BUFFER_LOAD; | ||||
case AMDGPU::BUFFER_STORE_DWORD_OFFEN: | case AMDGPU::BUFFER_STORE_DWORD_OFFEN: | ||||
case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: | case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: | ||||
case AMDGPU::BUFFER_STORE_DWORD_OFFSET: | case AMDGPU::BUFFER_STORE_DWORD_OFFSET: | ||||
case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: | case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: | ||||
return BUFFER_STORE; | return BUFFER_STORE; | ||||
} | } | ||||
} | } | ||||
if (TII.isMIMG(Opc)) { | |||||
// Ignore instructions encoded without vaddr. | |||||
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) | |||||
return UNKNOWN; | |||||
// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. | |||||
if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc)) | |||||
return UNKNOWN; | |||||
return MIMG; | |||||
} | |||||
return UNKNOWN; | return UNKNOWN; | ||||
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | ||||
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | ||||
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | ||||
return S_BUFFER_LOAD_IMM; | return S_BUFFER_LOAD_IMM; | ||||
case AMDGPU::DS_READ_B32: | case AMDGPU::DS_READ_B32: | ||||
case AMDGPU::DS_READ_B32_gfx9: | case AMDGPU::DS_READ_B32_gfx9: | ||||
case AMDGPU::DS_READ_B64: | case AMDGPU::DS_READ_B64: | ||||
Show All 9 Lines | |||||
/// Determines instruction subclass from opcode. Only instructions | /// Determines instruction subclass from opcode. Only instructions | ||||
/// of the same subclass can be merged together. | /// of the same subclass can be merged together. | ||||
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { | static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { | ||||
switch (Opc) { | switch (Opc) { | ||||
default: | default: | ||||
if (TII.isMUBUF(Opc)) | if (TII.isMUBUF(Opc)) | ||||
return AMDGPU::getMUBUFBaseOpcode(Opc); | return AMDGPU::getMUBUFBaseOpcode(Opc); | ||||
if (TII.isMIMG(Opc)) { | |||||
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); | |||||
assert(Info); | |||||
return Info->BaseOpcode; | |||||
} | |||||
return -1; | return -1; | ||||
case AMDGPU::DS_READ_B32: | case AMDGPU::DS_READ_B32: | ||||
case AMDGPU::DS_READ_B32_gfx9: | case AMDGPU::DS_READ_B32_gfx9: | ||||
case AMDGPU::DS_READ_B64: | case AMDGPU::DS_READ_B64: | ||||
case AMDGPU::DS_READ_B64_gfx9: | case AMDGPU::DS_READ_B64_gfx9: | ||||
case AMDGPU::DS_WRITE_B32: | case AMDGPU::DS_WRITE_B32: | ||||
case AMDGPU::DS_WRITE_B32_gfx9: | case AMDGPU::DS_WRITE_B32_gfx9: | ||||
case AMDGPU::DS_WRITE_B64: | case AMDGPU::DS_WRITE_B64: | ||||
Show All 20 Lines | if (TII.isMUBUF(Opc)) { | ||||
if (AMDGPU::getMUBUFHasSoffset(Opc)) { | if (AMDGPU::getMUBUFHasSoffset(Opc)) { | ||||
result |= SOFFSET; | result |= SOFFSET; | ||||
} | } | ||||
return result; | return result; | ||||
} | } | ||||
if (TII.isMIMG(Opc)) { | |||||
unsigned result = VADDR | SRSRC; | |||||
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); | |||||
if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) | |||||
result |= SSAMP; | |||||
return result; | |||||
} | |||||
switch (Opc) { | switch (Opc) { | ||||
default: | default: | ||||
return 0; | return 0; | ||||
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: | ||||
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: | ||||
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: | ||||
return SBASE; | return SBASE; | ||||
case AMDGPU::DS_READ_B32: | case AMDGPU::DS_READ_B32: | ||||
Show All 33 Lines | void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, | ||||
case S_BUFFER_LOAD_IMM: | case S_BUFFER_LOAD_IMM: | ||||
EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); | EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); | ||||
break; | break; | ||||
default: | default: | ||||
EltSize = 4; | EltSize = 4; | ||||
break; | break; | ||||
} | } | ||||
int OffsetIdx = | if (InstClass == MIMG) { | ||||
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); | DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); | ||||
} else { | |||||
int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); | |||||
Offset0 = I->getOperand(OffsetIdx).getImm(); | Offset0 = I->getOperand(OffsetIdx).getImm(); | ||||
} | |||||
Width0 = getOpcodeWidth(*I, TII); | Width0 = getOpcodeWidth(*I, TII); | ||||
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | ||||
Offset0 &= 0xffff; | Offset0 &= 0xffff; | ||||
} else { | } else if (InstClass != MIMG) { | ||||
GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); | GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); | ||||
if (InstClass != S_BUFFER_LOAD_IMM) { | if (InstClass != S_BUFFER_LOAD_IMM) { | ||||
SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); | SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); | ||||
} | } | ||||
DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); | DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); | ||||
} | } | ||||
unsigned AddrOpName[5] = {0}; | unsigned AddrOpName[5] = {0}; | ||||
Show All 15 Lines | void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, | ||||
if (Regs & SOFFSET) { | if (Regs & SOFFSET) { | ||||
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; | AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; | ||||
} | } | ||||
if (Regs & VADDR) { | if (Regs & VADDR) { | ||||
AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; | AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; | ||||
} | } | ||||
if (Regs & SSAMP) { | |||||
AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; | |||||
} | |||||
for (unsigned i = 0; i < NumAddresses; i++) { | for (unsigned i = 0; i < NumAddresses; i++) { | ||||
AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); | AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); | ||||
AddrReg[i] = &I->getOperand(AddrIdx[i]); | AddrReg[i] = &I->getOperand(AddrIdx[i]); | ||||
} | } | ||||
InstsToMove.clear(); | InstsToMove.clear(); | ||||
} | } | ||||
void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, | void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, | ||||
const SIInstrInfo &TII) { | const SIInstrInfo &TII) { | ||||
Paired = MI; | Paired = MI; | ||||
assert(InstClass == getInstClass(Paired->getOpcode(), TII)); | assert(InstClass == getInstClass(Paired->getOpcode(), TII)); | ||||
if (InstClass == MIMG) { | |||||
DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm(); | |||||
} else { | |||||
int OffsetIdx = | int OffsetIdx = | ||||
AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); | AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); | ||||
Offset1 = Paired->getOperand(OffsetIdx).getImm(); | Offset1 = Paired->getOperand(OffsetIdx).getImm(); | ||||
} | |||||
Width1 = getOpcodeWidth(*Paired, TII); | Width1 = getOpcodeWidth(*Paired, TII); | ||||
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | ||||
Offset1 &= 0xffff; | Offset1 &= 0xffff; | ||||
} else { | } else if (InstClass != MIMG) { | ||||
GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); | GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); | ||||
if (InstClass != S_BUFFER_LOAD_IMM) { | if (InstClass != S_BUFFER_LOAD_IMM) { | ||||
SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); | SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); | ||||
} | } | ||||
DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); | DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); | ||||
} | } | ||||
} | } | ||||
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines | static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, | ||||
// This function adds the offset parameter to the existing offset for A, | // This function adds the offset parameter to the existing offset for A, | ||||
// so we pass 0 here as the offset and then manually set it to the correct | // so we pass 0 here as the offset and then manually set it to the correct | ||||
// value after the call. | // value after the call. | ||||
MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); | MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); | ||||
MMO->setOffset(MinOffset); | MMO->setOffset(MinOffset); | ||||
return MMO; | return MMO; | ||||
} | } | ||||
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { | |||||
assert(CI.InstClass == MIMG); | |||||
// Ignore instructions with tfe/lwe set. | |||||
const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); | |||||
const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); | |||||
if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) | |||||
return false; | |||||
// Check other optional immediate operands for equality. | |||||
unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, | |||||
AMDGPU::OpName::d16, AMDGPU::OpName::unorm, | |||||
AMDGPU::OpName::da, AMDGPU::OpName::r128}; | |||||
for (auto op : OperandsToMatch) { | |||||
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); | |||||
if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx) | |||||
return false; | |||||
if (Idx != -1 && | |||||
CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm()) | |||||
return false; | |||||
} | |||||
// Check DMask for overlaps. | |||||
unsigned MaxMask = std::max(CI.DMask0, CI.DMask1); | |||||
unsigned MinMask = std::min(CI.DMask0, CI.DMask1); | |||||
unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); | |||||
if ((1u << AllowedBitsForMin) <= MinMask) | |||||
return false; | |||||
return true; | |||||
} | |||||
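To make the dmask rule above concrete, here is a minimal standalone sketch (not part of the patch; the helper name, the sample masks, and the use of __builtin_ctz as a stand-in for llvm::countTrailingZeros are illustrative): two dmasks are only considered combinable when every channel selected by the smaller mask lies below the lowest channel selected by the larger one, so even disjoint but interleaved masks are rejected.

// Illustrative sketch of the dmask check in dmasksCanBeCombined above.
#include <algorithm>
#include <cassert>

// __builtin_ctz plays the role of llvm::countTrailingZeros here.
static bool dmasksDisjointAndOrdered(unsigned DMask0, unsigned DMask1) {
  unsigned MaxMask = std::max(DMask0, DMask1);
  unsigned MinMask = std::min(DMask0, DMask1);
  unsigned AllowedBitsForMin = __builtin_ctz(MaxMask);
  return (1u << AllowedBitsForMin) > MinMask;
}

int main() {
  assert(dmasksDisjointAndOrdered(0x3, 0xc));  // 0b0011 + 0b1100: combinable
  assert(!dmasksDisjointAndOrdered(0x5, 0xa)); // 0b0101 + 0b1010: interleaved, rejected
  assert(!dmasksDisjointAndOrdered(0x3, 0x6)); // 0b0011 + 0b0110: overlap, rejected
  return 0;
}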
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { | ||||
assert(CI.InstClass != MIMG); | |||||
// XXX - Would the same offset be OK? Is there any reason this would happen or | // XXX - Would the same offset be OK? Is there any reason this would happen or | ||||
// be useful? | // be useful? | ||||
if (CI.Offset0 == CI.Offset1) | if (CI.Offset0 == CI.Offset1) | ||||
return false; | return false; | ||||
// This won't be valid if the offset isn't aligned. | // This won't be valid if the offset isn't aligned. | ||||
if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) | if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) | ||||
return false; | return false; | ||||
▲ Show 20 Lines • Show All 139 Lines • ▼ Show 20 Lines | if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, | ||||
CI.InstsToMove)) | CI.InstsToMove)) | ||||
continue; | continue; | ||||
bool Match = CI.hasSameBaseAddress(*MBBI); | bool Match = CI.hasSameBaseAddress(*MBBI); | ||||
if (Match) { | if (Match) { | ||||
CI.setPaired(MBBI, *TII); | CI.setPaired(MBBI, *TII); | ||||
// Check both offsets fit in the reduced range. | // Check both offsets (or masks for MIMG) can be combined and fit in the | ||||
// reduced range. | |||||
bool canBeCombined = | |||||
CI.InstClass == MIMG | |||||
? dmasksCanBeCombined(CI, *TII) | |||||
: widthsFit(*STM, CI) && offsetsCanBeCombined(CI); | |||||
// We also need to go through the list of instructions that we plan to | // We also need to go through the list of instructions that we plan to | ||||
// move and make sure they are all safe to move down past the merged | // move and make sure they are all safe to move down past the merged | ||||
// instruction. | // instruction. | ||||
if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) | if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) | ||||
if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) | |||||
return true; | return true; | ||||
} | } | ||||
// We've found a load/store that we couldn't merge for some reason. | // We've found a load/store that we couldn't merge for some reason. | ||||
// We could potentially keep looking, but we'd need to make sure that | // We could potentially keep looking, but we'd need to make sure that | ||||
// it was safe to move I and also all the instructions in InstsToMove	| // it was safe to move I and also all the instructions in InstsToMove	| ||||
// down past this instruction. | // down past this instruction. | ||||
// check if we can move I across MBBI and if we can move all I's users | // check if we can move I across MBBI and if we can move all I's users | ||||
if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || | if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || | ||||
▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines | SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { | ||||
CI.I->eraseFromParent(); | CI.I->eraseFromParent(); | ||||
CI.Paired->eraseFromParent(); | CI.Paired->eraseFromParent(); | ||||
LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); | ||||
return Write2; | return Write2; | ||||
} | } | ||||
MachineBasicBlock::iterator | MachineBasicBlock::iterator | ||||
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { | |||||
MachineBasicBlock *MBB = CI.I->getParent(); | |||||
DebugLoc DL = CI.I->getDebugLoc(); | |||||
const unsigned Opcode = getNewOpcode(CI); | |||||
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | |||||
Register DestReg = MRI->createVirtualRegister(SuperRC); | |||||
unsigned MergedDMask = CI.DMask0 | CI.DMask1; | |||||
unsigned DMaskIdx = | |||||
AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); | |||||
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); | |||||
for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { | |||||
if (I == DMaskIdx) | |||||
MIB.addImm(MergedDMask); | |||||
else | |||||
MIB.add((*CI.I).getOperand(I)); | |||||
} | |||||
// It shouldn't be possible to get this far if the two instructions | |||||
// don't have a single memoperand, because MachineInstr::mayAlias() | |||||
// will return true if this is the case. | |||||
assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); | |||||
const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); | |||||
const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); | |||||
MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); | |||||
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); | |||||
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); | |||||
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); | |||||
// Copy to the old destination registers. | |||||
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); | |||||
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); | |||||
const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); | |||||
BuildMI(*MBB, CI.Paired, DL, CopyDesc) | |||||
.add(*Dest0) // Copy to same destination including flags and sub reg. | |||||
.addReg(DestReg, 0, SubRegIdx0); | |||||
MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) | |||||
.add(*Dest1) | |||||
.addReg(DestReg, RegState::Kill, SubRegIdx1); | |||||
moveInstsAfter(Copy1, CI.InstsToMove); | |||||
CI.I->eraseFromParent(); | |||||
CI.Paired->eraseFromParent(); | |||||
return New; | |||||
} | |||||
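After the merge, each original destination is re-created as a COPY from a slice of the wide result. The sketch below is a rough standalone illustration of which channel each original def starts at, assuming (as the MIMG ordering in getSubRegIdxs below suggests) that the instruction whose dmask selects the lower channels takes the lower sub-registers; the helper and the example values are hypothetical, not pass code.

// Hypothetical helper mirroring the ordering used for the two COPYs above:
// the def covering the lower dmask channels is copied starting at sub0, the
// other def from the channels that follow it.
#include <cassert>
#include <utility>

static std::pair<unsigned, unsigned>
startChannels(unsigned DMask0, unsigned Width0, unsigned DMask1, unsigned Width1) {
  if (DMask0 > DMask1)
    return {Width1, 0u}; // Paired's def occupies channels [0, Width1), I follows it
  return {0u, Width0};   // I's def occupies channels [0, Width0), Paired follows it
}

int main() {
  // e.g. I has dmask 0x4 (one channel), Paired has dmask 0x3 (two channels):
  auto Chans = startChannels(/*DMask0=*/0x4, /*Width0=*/1, /*DMask1=*/0x3, /*Width1=*/2);
  assert(Chans.first == 2 && Chans.second == 0); // I -> sub2, Paired -> sub0_sub1
  return 0;
}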
MachineBasicBlock::iterator | |||||
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { | SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { | ||||
MachineBasicBlock *MBB = CI.I->getParent(); | MachineBasicBlock *MBB = CI.I->getParent(); | ||||
DebugLoc DL = CI.I->getDebugLoc(); | DebugLoc DL = CI.I->getDebugLoc(); | ||||
const unsigned Opcode = getNewOpcode(CI); | const unsigned Opcode = getNewOpcode(CI); | ||||
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | ||||
Register DestReg = MRI->createVirtualRegister(SuperRC); | Register DestReg = MRI->createVirtualRegister(SuperRC); | ||||
▲ Show 20 Lines • Show All 115 Lines • ▼ Show 20 Lines | case S_BUFFER_LOAD_IMM: | ||||
switch (Width) { | switch (Width) { | ||||
default: | default: | ||||
return 0; | return 0; | ||||
case 2: | case 2: | ||||
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; | ||||
case 4: | case 4: | ||||
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; | ||||
} | } | ||||
case MIMG: | |||||
assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width)); | |||||
return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); | |||||
} | } | ||||
} | } | ||||
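For completeness, a small standalone sketch (illustrative only; the struct and helper are not part of the pass) of the relationship the MIMG case above relies on: the merged instruction's dmask is the bitwise OR of the two original masks, and the width handed to AMDGPU::getMaskedMIMGOp is that OR's population count.

// Illustrative only: merged dmask and width for the MIMG case of getNewOpcode.
#include <cassert>

struct MergedDMaskInfo {
  unsigned DMask; // dmask carried by the merged image load
  unsigned Width; // number of result channels, i.e. what getMaskedMIMGOp needs
};

static MergedDMaskInfo mergeDMasks(unsigned DMask0, unsigned DMask1) {
  unsigned Merged = DMask0 | DMask1;
  // __builtin_popcount stands in for llvm::countPopulation.
  return {Merged, static_cast<unsigned>(__builtin_popcount(Merged))};
}

int main() {
  MergedDMaskInfo M = mergeDMasks(0x1, 0x6);
  assert(M.DMask == 0x7 && M.Width == 3); // selects a three-channel opcode variant
  return 0;
}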
std::pair<unsigned, unsigned> | std::pair<unsigned, unsigned> | ||||
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { | ||||
if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)	| if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)	| ||||
return std::make_pair(0, 0); | return std::make_pair(0, 0); | ||||
bool ReverseOrder = CI.Offset0 > CI.Offset1; | bool ReverseOrder; | ||||
if (CI.InstClass == MIMG) { | |||||
assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) && | |||||
"No overlaps"); | |||||
ReverseOrder = CI.DMask0 > CI.DMask1; | |||||
} else | |||||
ReverseOrder = CI.Offset0 > CI.Offset1; | |||||
static const unsigned Idxs[4][4] = { | static const unsigned Idxs[4][4] = { | ||||
{AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, | ||||
{AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, | ||||
{AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, | {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, | ||||
{AMDGPU::sub3, 0, 0, 0}, | {AMDGPU::sub3, 0, 0, 0}, | ||||
}; | }; | ||||
unsigned Idx0; | unsigned Idx0; | ||||
▲ Show 20 Lines • Show All 552 Lines • ▼ Show 20 Lines | case BUFFER_STORE: | ||||
if (findMatchingInst(CI)) { | if (findMatchingInst(CI)) { | ||||
Modified = true; | Modified = true; | ||||
removeCombinedInst(MergeList, *CI.Paired); | removeCombinedInst(MergeList, *CI.Paired); | ||||
MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); | MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); | ||||
CI.setMI(NewMI, *TII, *STM); | CI.setMI(NewMI, *TII, *STM); | ||||
OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | ||||
} | } | ||||
break; | break; | ||||
case MIMG: | |||||
if (findMatchingInst(CI)) { | |||||
Modified = true; | |||||
removeCombinedInst(MergeList, *CI.Paired); | |||||
MachineBasicBlock::iterator NewMI = mergeImagePair(CI); | |||||
CI.setMI(NewMI, *TII, *STM); | |||||
OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | |||||
} | |||||
break; | |||||
} | } | ||||
// Clear the InstsToMove after we have finished searching so we don't have | // Clear the InstsToMove after we have finished searching so we don't have | ||||
// stale values left over if we search for this CI again in another pass | // stale values left over if we search for this CI again in another pass | ||||
// over the block. | // over the block. | ||||
CI.InstsToMove.clear(); | CI.InstsToMove.clear(); | ||||
} | } | ||||
return Modified; | return Modified; | ||||
Show All 35 Lines |