diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -103,15 +103,19 @@ TBUFFER_STORE, }; -enum RegisterEnum { - SBASE = 0x1, - SRSRC = 0x2, - SOFFSET = 0x4, - VADDR = 0x8, - ADDR = 0x10, - SSAMP = 0x20, +struct AddressRegs { + unsigned char NumVAddrs = 0; + bool SBase = false; + bool SRsrc = false; + bool SOffset = false; + bool VAddr = false; + bool Addr = false; + bool SSamp = false; }; +// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. +const unsigned MaxAddressRegs = 12 + 1 + 1; + class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; @@ -126,8 +130,8 @@ bool SLC; bool DLC; bool UseST64; - int AddrIdx[5]; - const MachineOperand *AddrReg[5]; + int AddrIdx[MaxAddressRegs]; + const MachineOperand *AddrReg[MaxAddressRegs]; unsigned NumAddresses; unsigned Order; @@ -344,7 +348,8 @@ } if (TII.isMIMG(Opc)) { // Ignore instructions encoded without vaddr. - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) return UNKNOWN; // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || @@ -417,58 +422,54 @@ } } -static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { - if (TII.isMUBUF(Opc)) { - unsigned result = 0; +static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { + AddressRegs Result; - if (AMDGPU::getMUBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMUBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMUBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; + if (TII.isMUBUF(Opc)) { + if (AMDGPU::getMUBUFHasVAddr(Opc)) + Result.VAddr = true; + if (AMDGPU::getMUBUFHasSrsrc(Opc)) + Result.SRsrc = true; + if (AMDGPU::getMUBUFHasSoffset(Opc)) + Result.SOffset = true; + + return Result; } if (TII.isMIMG(Opc)) { - unsigned result = VADDR | SRSRC; + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + if (VAddr0Idx >= 0) { + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + Result.NumVAddrs = SRsrcIdx - VAddr0Idx; + } else { + Result.VAddr = true; + } + Result.SRsrc = true; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) - result |= SSAMP; + Result.SSamp = true; - return result; + return Result; } if (TII.isMTBUF(Opc)) { - unsigned result = 0; - - if (AMDGPU::getMTBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMTBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMTBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; + if (AMDGPU::getMTBUFHasVAddr(Opc)) + Result.VAddr = true; + if (AMDGPU::getMTBUFHasSrsrc(Opc)) + Result.SRsrc = true; + if (AMDGPU::getMTBUFHasSoffset(Opc)) + Result.SOffset = true; + + return Result; } switch (Opc) { default: - return 0; + return Result; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return SBASE; + Result.SBase = true; + return Result; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B64: case AMDGPU::DS_READ_B32_gfx9: @@ -477,7 +478,8 @@ case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B32_gfx9: case AMDGPU::DS_WRITE_B64_gfx9: - return ADDR; + Result.Addr = true; + return Result; } } @@ -534,38 +536,34 @@ DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); } - unsigned AddrOpName[5] = {0}; - NumAddresses = 0; - const unsigned Regs = getRegs(I->getOpcode(), TII); - - if (Regs & ADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - } - - if (Regs & SBASE) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - } - - if (Regs & SRSRC) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - } - - if (Regs & SOFFSET) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - } + AddressRegs Regs = getRegs(Opc, TII); - if (Regs & VADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - } - - if (Regs & SSAMP) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; - } - - for (unsigned i = 0; i < NumAddresses; i++) { - AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); - AddrReg[i] = &I->getOperand(AddrIdx[i]); - } + NumAddresses = 0; + for (unsigned J = 0; J < Regs.NumVAddrs; J++) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; + if (Regs.Addr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); + if (Regs.SBase) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); + if (Regs.SRsrc) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + if (Regs.SOffset) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.VAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); + if (Regs.SSamp) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); + assert(NumAddresses <= MaxAddressRegs); + + for (unsigned J = 0; J < NumAddresses; J++) + AddrReg[J] = &I->getOperand(AddrIdx[J]); } } // end anonymous namespace. @@ -1283,9 +1281,9 @@ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); // It shouldn't be possible to get this far if the two instructions @@ -1346,9 +1344,9 @@ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); unsigned JoinedFormat = @@ -1426,9 +1424,9 @@ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); unsigned JoinedFormat = @@ -1589,9 +1587,9 @@ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); @@ -1986,6 +1984,8 @@ if (!CI.hasMergeableAddress(*MRI)) continue; + LLVM_DEBUG(dbgs() << "Mergeable: " << MI); + addInstToMergeableList(CI, MergeableInsts); } @@ -2082,6 +2082,8 @@ Modified = true; + LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); + switch (CI.InstClass) { default: llvm_unreachable("unknown InstClass"); diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir --- a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s # GFX9-LABEL: name: image_load_merged_v1v3 # GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir --- a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s # GFX9-LABEL: name: image_sample_l_merged_v1v3 # GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)