namespace {

/// Broad classes of memory instruction distinguished in this file.
/// UNKNOWN is the value returned for opcodes that are not recognised or are
/// deliberately left unsupported; MIMG covers image instructions.
enum InstClassEnum {
  UNKNOWN = 0,
  DS_READ = 1,
  DS_WRITE = 2,
  S_BUFFER_LOAD_IMM = 3,
  BUFFER_LOAD = 4,
  BUFFER_STORE = 5,
  MIMG = 6,
};

101 | 102 | | |||

/// Bit flags naming the address operands an instruction carries.
/// Every enumerator is a distinct single bit, so values can be combined with
/// '|' and queried with '&'.
enum RegisterEnum {
  SBASE = 1 << 0,
  SRSRC = 1 << 1,
  SOFFSET = 1 << 2,
  VADDR = 1 << 3,
  ADDR = 1 << 4,
  SSAMP = 1 << 5,
};

109 | 111 | | |||

class SILoadStoreOptimizer : public MachineFunctionPass {

struct CombineInfo {

MachineBasicBlock::iterator I;

MachineBasicBlock::iterator Paired;

unsigned EltSize;

unsigned Offset0;

unsigned Offset1;

unsigned Width0;

unsigned Width1;

unsigned BaseOff;

unsigned DMask0;

unsigned DMask1;

InstClassEnum InstClass;

bool GLC0;

bool GLC1;

bool SLC0;

bool SLC1;

bool DLC0;

bool DLC1;

bool UseST64;

private:

const GCNSubtarget *STM = nullptr;

const SIInstrInfo *TII = nullptr;

const SIRegisterInfo *TRI = nullptr;

MachineRegisterInfo *MRI = nullptr;

AliasAnalysis *AA = nullptr;

bool OptimizeAgain;

207 | 211 | | |||

static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);

static bool offsetsCanBeCombined(CombineInfo &CI);

static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);

static unsigned getNewOpcode(const CombineInfo &CI);

static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);

const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);

213 | 218 | | |||

bool findMatchingInst(CombineInfo &CI);

215 | 220 | | |||

unsigned read2Opcode(unsigned EltSize) const;

unsigned read2ST64Opcode(unsigned EltSize) const;

MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

219 | 224 | | |||

unsigned write2Opcode(unsigned EltSize) const;

unsigned write2ST64Opcode(unsigned EltSize) const;

MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);

MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);

MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);

MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);

MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

226 | 232 | | |||

void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,

int32_t NewOffset) const;

unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;

MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;

268 | 274 | | |||

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {

const unsigned Opc = MI.getOpcode();

271 | 277 | | |||

if (TII.isMUBUF(Opc)) {

// FIXME: Handle d16 correctly

return AMDGPU::getMUBUFElements(Opc);

}

if (TII.isMIMG(MI)) {

uint64_t DMaskImm =

TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

return countPopulation(DMaskImm);

}

276 | 287 | | |||

switch (Opc) {

case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

return 1;

case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

return 2;

case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

return 4;

return BUFFER_LOAD;

case AMDGPU::BUFFER_STORE_DWORD_OFFEN:

case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:

case AMDGPU::BUFFER_STORE_DWORD_OFFSET:

case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:

return BUFFER_STORE;

}

}

if (TII.isMIMG(Opc)) {

// Ignore instructions encoded without vaddr.

if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)

return UNKNOWN;

// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.

if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))

return UNKNOWN;

return MIMG;

}

return UNKNOWN;

case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

return S_BUFFER_LOAD_IMM;

case AMDGPU::DS_READ_B32:

case AMDGPU::DS_READ_B32_gfx9:

case AMDGPU::DS_READ_B64:

326 | 346 | | |||

/// Determines instruction subclass from opcode. Only instructions

/// of the same subclass can be merged together.

static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {

switch (Opc) {

default:

if (TII.isMUBUF(Opc))

return AMDGPU::getMUBUFBaseOpcode(Opc);

if (TII.isMIMG(Opc)) {

const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);

assert(Info);

return Info->BaseOpcode;

}

return -1;

case AMDGPU::DS_READ_B32:

case AMDGPU::DS_READ_B32_gfx9:

case AMDGPU::DS_READ_B64:

case AMDGPU::DS_READ_B64_gfx9:

case AMDGPU::DS_WRITE_B32:

case AMDGPU::DS_WRITE_B32_gfx9:

case AMDGPU::DS_WRITE_B64:

if (TII.isMUBUF(Opc)) {

362 | 387 | | |||

if (AMDGPU::getMUBUFHasSoffset(Opc)) {

result |= SOFFSET;

}

366 | 391 | | |||

return result;

}

369 | 394 | | |||

if (TII.isMIMG(Opc)) {

unsigned result = VADDR | SRSRC;

const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);

if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)

result |= SSAMP;

return result;

}

402 | | ||||

switch (Opc) {

default:

return 0;

case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

return SBASE;

case AMDGPU::DS_READ_B32:

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,

case S_BUFFER_LOAD_IMM:

EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);

break;

default:

EltSize = 4;

break;

}

418 | 451 | | |||

if (InstClass == MIMG) {

DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

} else {

int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);

Offset0 = I->getOperand(OffsetIdx).getImm();

}

458 | | ||||

Width0 = getOpcodeWidth(*I, TII);

423 | 460 | | |||

if ((Inst

425 | Offset0 &= 0xffff; | 462 | Offset0 &= 0xffff; | ||

426 | } else { | 463 | } else if (InstClass != MIMG) { | ||

427 | GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); | 464 | GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); | ||

428 | if (InstClass != S_BUFFER_LOAD_IMM) { | 465 | if (InstClass != S_BUFFER_LOAD_IMM) { | ||

429 | SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); | 466 | SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); | ||

430 | } | 467 | } | ||

431 | DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); | 468 | DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); | ||

432 | } | 469 | } | ||

433 | 470 | | |||

434 | unsigned AddrOpName[5] = {0}; | 471 | unsigned AddrOpName[5] = {0}; | ||

450 | if (Regs & SOFFSET) { | 487 | if (Regs & SOFFSET) { | ||

451 | AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; | 488 | AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; | ||

452 | } | 489 | } | ||

453 | 490 | | |||

454 | if (Regs & VADDR) { | 491 | if (Regs & VADDR) { | ||

455 | AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; | 492 | AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; | ||

456 | } | 493 | } | ||

457 | 494 | | |||

495 | if (Regs & SSAMP) { | ||||

496 | AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; | ||||

497 | } | ||||

498 | | ||||

458 | for (unsigned i = 0; i < NumAddresses; i++) { | 499 | for (unsigned i = 0; i < NumAddresses; i++) { | ||

459 | AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); | 500 | AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); | ||

460 | AddrReg[i] = &I->getOperand(AddrIdx[i]); | 501 | AddrReg[i] = &I->getOperand(AddrIdx[i]); | ||

461 | } | 502 | } | ||

462 | 503 | | |||

463 | InstsToMove.clear(); | 504 | InstsToMove.clear(); | ||

464 | } | 505 | } | ||

465 | 506 | | |||

466 | void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, | 507 | void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, | ||

467 | const SIInstrInfo &TII) { | 508 | const SIInstrInfo &TII) { | ||

468 | Paired = MI; | 509 | Paired = MI; | ||

469 | assert(InstClass == getInstClass(Paired->getOpcode(), TII)); | 510 | assert(InstClass == getInstClass(Paired->getOpcode(), TII)); | ||

511 | | ||||

512 | if (InstClass == MIMG) { | ||||

513 | DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm(); | ||||

514 | } else { | ||||

470 | int OffsetIdx = | 515 | int OffsetIdx = | ||

471 | AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); | 516 | AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); | ||

472 | Offset1 = Paired->getOperand(OffsetIdx).getImm(); | 517 | Offset1 = Paired->getOperand(OffsetIdx).getImm(); | ||

518 | } | ||||

519 | | ||||

473 | Width1 = getOpcodeWidth(*Paired, TII); | 520 | Width1 = getOpcodeWidth(*Paired, TII); | ||

474 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | 521 | if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { | ||

475 | Offset1 &= 0xffff; | 522 | Offset1 &= 0xffff; | ||

476 | } else { | 523 | } else if (InstClass != MIMG) { | ||

477 | GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); | 524 | GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); | ||

478 | if (InstClass != S_BUFFER_LOAD_IMM) { | 525 | if (InstClass != S_BUFFER_LOAD_IMM) { | ||

479 | SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); | 526 | SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); | ||

480 | } | 527 | } | ||

481 | DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); | 528 | DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); | ||

482 | } | 529 | } | ||

483 | } | 530 | } | ||

484 | 531 | | |||

583 | // This function adds the offset parameter to the existing offset for A, | 630 | // This function adds the offset parameter to the existing offset for A, | ||

584 | // so we pass 0 here as the offset and then manually set it to the correct | 631 | // so we pass 0 here as the offset and then manually set it to the correct | ||

585 | // value after the call. | 632 | // value after the call. | ||

586 | MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); | 633 | MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); | ||

587 | MMO->setOffset(MinOffset); | 634 | MMO->setOffset(MinOffset); | ||

588 | return MMO; | 635 | return MMO; | ||

589 | } | 636 | } | ||

590 | 637 | | |||

638 | bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { | ||||

639 | assert(CI.InstClass == MIMG); | ||||

640 | | ||||

641 | // Ignore instructions with tfe/lwe set. | ||||

642 | const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); | ||||

643 | const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); | ||||

644 | | ||||

645 | if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) | ||||

646 | return false; | ||||

647 | | ||||

648 | // Check other optional immediate operands for equality. | ||||

649 | unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, | ||||

650 | AMDGPU::OpName::d16, AMDGPU::OpName::unorm, | ||||

651 | AMDGPU::OpName::da, AMDGPU::OpName::r128}; | ||||

652 | | ||||

653 | for (auto op : OperandsToMatch) { | ||||

654 | int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); | ||||

655 | if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx) | ||||

656 | return false; | ||||

657 | if (Idx != -1 && | ||||

658 | CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm()) | ||||

659 | return false; | ||||

660 | } | ||||

661 | | ||||

662 | // Check DMask for overlaps. | ||||

663 | unsigned MaxMask = std::max(CI.DMask0, CI.DMask1); | ||||

664 | unsigned MinMask = std::min(CI.DMask0, CI.DMask1); | ||||

665 | | ||||

666 | unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); | ||||

667 | if ((1u << AllowedBitsForMin) <= MinMask) | ||||

668 | return false; | ||||

669 | | ||||

670 | return true; | ||||

671 | } | ||||

672 | | ||||

591 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { | 673 | bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { | ||

674 | assert(CI.InstClass != MIMG); | ||||

675 | | ||||

592 | // XXX - Would the same offset be OK? Is there any reason this would happen or | 676 | // XXX - Would the same offset be OK? Is there any reason this would happen or | ||

593 | // be useful? | 677 | // be useful? | ||

594 | if (CI.Offset0 == CI.Offset1) | 678 | if (CI.Offset0 == CI.Offset1) | ||

595 | return false; | 679 | return false; | ||

596 | 680 | | |||

597 | // This won't be valid if the offset isn't aligned. | 681 | // This won't be valid if the offset isn't aligned. | ||

598 | if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) | 682 | if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) | ||

599 | return false; | 683 | return false; | ||

739 | CI.InstsToMove)) | 823 | CI.InstsToMove)) | ||

740 | continue; | 824 | continue; | ||

741 | 825 | | |||

742 | bool Match = CI.hasSameBaseAddress(*MBBI); | 826 | bool Match = CI.hasSameBaseAddress(*MBBI); | ||

743 | 827 | | |||

744 | if (Match) { | 828 | if (Match) { | ||

745 | CI.setPaired(MBBI, *TII); | 829 | CI.setPaired(MBBI, *TII); | ||

746 | 830 | | |||

747 | // Check both offsets fit in the reduced range. | 831 | // Check both offsets (or masks for MIMG) can be combined and fit in the | ||

832 | // reduced range. | ||||

833 | bool canBeCombined = | ||||

834 | CI.InstClass == MIMG | ||||

835 | ? dmasksCanBeCombined(CI, *TII) | ||||

836 | : widthsFit(*STM, CI) && offsetsCanBeCombined(CI); | ||||

837 | | ||||

748 | // We also need to go through the list of instructions that we plan to | 838 | // We also need to go through the list of instructions that we plan to | ||

749 | // move and make sure they are all safe to move down past the merged | 839 | // move and make sure they are all safe to move down past the merged | ||

750 | // instruction. | 840 | // instruction. | ||

751 | if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) | 841 | if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) | ||

752 | if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) | | |||

753 | return true; | 842 | return true; | ||

754 | } | 843 | } | ||

755 | 844 | | |||

756 | // We've found a load/store that we couldn't merge for some reason. | 845 | // We've found a load/store that we couldn't merge for some reason. | ||

757 | // We could potentially keep looking, but we'd need to make sure that | 846 | // We could potentially keep looking, but we'd need to make sure that | ||

758 | // it was safe to move I and also all the instruction in InstsToMove | 847 | // it was safe to move I and also all the instruction in InstsToMove | ||

759 | // down past this instruction. | 848 | // down past this instruction. | ||

760 | // check if we can move I across MBBI and if we can move all I's users | 849 | // check if we can move I across MBBI and if we can move all I's users | ||

941 | CI.I->eraseFromParent(); | 1030 | CI.I->eraseFromParent(); | ||

942 | CI.Paired->eraseFromParent(); | 1031 | CI.Paired->eraseFromParent(); | ||

943 | 1032 | | |||

944 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); | 1033 | LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); | ||

945 | return Write2; | 1034 | return Write2; | ||

946 | } | 1035 | } | ||

947 | 1036 | | |||

948 | MachineBasicBlock::iterator | 1037 | MachineBasicBlock::iterator | ||

1038 | SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { | ||||

1039 | MachineBasicBlock *MBB = CI.I->getParent(); | ||||

1040 | DebugLoc DL = CI.I->getDebugLoc(); | ||||

1041 | const unsigned Opcode = getNewOpcode(CI); | ||||

1042 | | ||||

1043 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | ||||

1044 | | ||||

1045 | Register DestReg = MRI->createVirtualRegister(SuperRC); | ||||

1046 | unsigned MergedDMask = CI.DMask0 | CI.DMask1; | ||||

1047 | unsigned DMaskIdx = | ||||

1048 | AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); | ||||

1049 | | ||||

1050 | auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); | ||||

1051 | for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { | ||||

1052 | if (I == DMaskIdx) | ||||

1053 | MIB.addImm(MergedDMask); | ||||

1054 | else | ||||

1055 | MIB.add((*CI.I).getOperand(I)); | ||||

1056 | } | ||||

1057 | | ||||

1058 | // It shouldn't be possible to get this far if the two instructions | ||||

1059 | // don't have a single memoperand, because MachineInstr::mayAlias() | ||||

1060 | // will return true if this is the case. | ||||

1061 | assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); | ||||

1062 | | ||||

1063 | const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); | ||||

1064 | const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); | ||||

1065 | | ||||

1066 | MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); | ||||

1067 | | ||||

1068 | std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); | ||||

1069 | const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); | ||||

1070 | const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); | ||||

1071 | | ||||

1072 | // Copy to the old destination registers. | ||||

1073 | const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); | ||||

1074 | const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); | ||||

1075 | const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); | ||||

1076 | | ||||

1077 | BuildMI(*MBB, CI.Paired, DL, CopyDesc) | ||||

1078 | .add(*Dest0) // Copy to same destination including flags and sub reg. | ||||

1079 | .addReg(DestReg, 0, SubRegIdx0); | ||||

1080 | MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) | ||||

1081 | .add(*Dest1) | ||||

1082 | .addReg(DestReg, RegState::Kill, SubRegIdx1); | ||||

1083 | | ||||

1084 | moveInstsAfter(Copy1, CI.InstsToMove); | ||||

1085 | | ||||

1086 | CI.I->eraseFromParent(); | ||||

1087 | CI.Paired->eraseFromParent(); | ||||

1088 | return New; | ||||

1089 | } | ||||

1090 | | ||||

1091 | MachineBasicBlock::iterator | ||||

949 | SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { | 1092 | SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { | ||

950 | MachineBasicBlock *MBB = CI.I->getParent(); | 1093 | MachineBasicBlock *MBB = CI.I->getParent(); | ||

951 | DebugLoc DL = CI.I->getDebugLoc(); | 1094 | DebugLoc DL = CI.I->getDebugLoc(); | ||

952 | const unsigned Opcode = getNewOpcode(CI); | 1095 | const unsigned Opcode = getNewOpcode(CI); | ||

953 | 1096 | | |||

954 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | 1097 | const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); | ||

955 | 1098 | | |||

956 | Register DestReg = MRI->createVirtualRegister(SuperRC); | 1099 | Register DestReg = MRI->createVirtualRegister(SuperRC); | ||

1072 | switch (Width) { | 1215 | switch (Width) { | ||

1073 | default: | 1216 | default: | ||

1074 | return 0; | 1217 | return 0; | ||

1075 | case 2: | 1218 | case 2: | ||

1076 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; | 1219 | return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; | ||

1077 | case 4: | 1220 | case 4: | ||

1078 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; | 1221 | return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; | ||

1079 | } | 1222 | } | ||

1223 | case MIMG: | ||||

1224 | assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width)); | ||||

1225 | return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); | ||||

1080 | } | 1226 | } | ||

1081 | } | 1227 | } | ||

1082 | 1228 | | |||

1083 | std::pair<unsigned, unsigned> | 1229 | std::pair<unsigned, unsigned> | ||

1084 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { | 1230 | SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { | ||

1085 | 1231 | | |||

1086 | if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4) | 1232 | if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4) | ||

1087 | return std::make_pair(0, 0); | 1233 | return std::make_pair(0, 0); | ||

1088 | 1234 | | |||

1089 | bool ReverseOrder = CI.Offset0 > CI.Offset1; | 1235 | bool ReverseOrder; | ||

1236 | if (CI.InstClass == MIMG) { | ||||

1237 | assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) && | ||||

1238 | "No overlaps"); | ||||

1239 | ReverseOrder = CI.DMask0 > CI.DMask1; | ||||

1240 | } else | ||||

1241 | ReverseOrder = CI.Offset0 > CI.Offset1; | ||||

1090 | 1242 | | |||

1091 | static const unsigned Idxs[4][4] = { | 1243 | static const unsigned Idxs[4][4] = { | ||

1092 | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, | 1244 | {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, | ||

1093 | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, | 1245 | {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, | ||

1094 | {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, | 1246 | {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, | ||

1095 | {AMDGPU::sub3, 0, 0, 0}, | 1247 | {AMDGPU::sub3, 0, 0, 0}, | ||

1096 | }; | 1248 | }; | ||

1097 | unsigned Idx0; | 1249 | unsigned Idx0; | ||

1650 | if (findMatchingInst(CI)) { | 1802 | if (findMatchingInst(CI)) { | ||

1651 | Modified = true; | 1803 | Modified = true; | ||

1652 | removeCombinedInst(MergeList, *CI.Paired); | 1804 | removeCombinedInst(MergeList, *CI.Paired); | ||

1653 | MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); | 1805 | MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); | ||

1654 | CI.setMI(NewMI, *TII, *STM); | 1806 | CI.setMI(NewMI, *TII, *STM); | ||

1655 | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | 1807 | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | ||

1656 | } | 1808 | } | ||

1657 | break; | 1809 | break; | ||

1810 | case MIMG: | ||||

1811 | if (findMatchingInst(CI)) { | ||||

1812 | Modified = true; | ||||

1813 | removeCombinedInst(MergeList, *CI.Paired); | ||||

1814 | MachineBasicBlock::iterator NewMI = mergeImagePair(CI); | ||||

1815 | CI.setMI(NewMI, *TII, *STM); | ||||

1816 | OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; | ||||

1817 | } | ||||

1818 | break; | ||||

1658 | } | 1819 | } | ||

1659 | // Clear the InstsToMove after we have finished searching so we don't have | 1820 | // Clear the InstsToMove after we have finished searching so we don't have | ||

1660 | // stale values left over if we search for this CI again in another pass | 1821 | // stale values left over if we search for this CI again in another pass | ||

1661 | // over the block. | 1822 | // over the block. | ||

1662 | CI.InstsToMove.clear(); | 1823 | CI.InstsToMove.clear(); | ||

1663 | } | 1824 | } | ||

1664 | 1825 | | |||

1665 | return Modified; | 1826 | return Modified; | ||

