diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -79,7 +79,8 @@
   MIMG,
   TBUFFER_LOAD,
   TBUFFER_STORE,
-  GLOBAL_LOAD
+  GLOBAL_LOAD,
+  GLOBAL_LOAD_SADDR
 };
 
 struct AddressRegs {
@@ -87,6 +88,7 @@
   bool SBase = false;
   bool SRsrc = false;
   bool SOffset = false;
+  bool SAddr = false;
   bool VAddr = false;
   bool Addr = false;
   bool SSamp = false;
@@ -305,14 +307,18 @@
   switch (Opc) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORD:
+  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
     return 1;
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
     return 2;
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
     return 3;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
     return 4;
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return 8;
@@ -402,6 +408,11 @@
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
     return GLOBAL_LOAD;
+  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+    return GLOBAL_LOAD_SADDR;
   }
 }
 
@@ -440,6 +451,11 @@
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
     return AMDGPU::GLOBAL_LOAD_DWORD;
+  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
   }
 }
 
@@ -502,6 +518,12 @@
   case AMDGPU::DS_WRITE_B64_gfx9:
     Result.Addr = true;
     return Result;
+  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
+    Result.SAddr = true;
+    LLVM_FALLTHROUGH;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -579,6 +601,9 @@
   if (Regs.SOffset)
     AddrIdx[NumAddresses++] =
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
+  if (Regs.SAddr)
+    AddrIdx[NumAddresses++] =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
   if (Regs.VAddr)
     AddrIdx[NumAddresses++] =
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
@@ -1402,6 +1427,9 @@
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
 
+  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
+    MIB.add(*SAddr);
+
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
 
@@ -1471,6 +1499,17 @@
     case 4:
       return AMDGPU::GLOBAL_LOAD_DWORDX4;
     }
+  case GLOBAL_LOAD_SADDR:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
+    case 3:
+      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
+    case 4:
+      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
+    }
   case MIMG:
     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
            "No overlaps");
@@ -2115,6 +2154,7 @@
       OptimizeListAgain |= CI.Width + Paired.Width < 4;
       break;
     case GLOBAL_LOAD:
+    case GLOBAL_LOAD_SADDR:
      NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
       OptimizeListAgain |= CI.Width + Paired.Width < 4;
       break;
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@@ -228,3 +228,172 @@
     %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
     S_NOP 0, implicit %1, implicit %2
 ...
+
+---
+name: merge_global_load_dword_saddr_2
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_saddr_2
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: merge_global_load_dword_saddr_3
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_saddr_3
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX3_SADDR]].sub2
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3, implicit %4
+...
+
+---
+name: merge_global_load_dword_saddr_4
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_saddr_4
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 0, 2, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5
+...
+
+---
+name: merge_global_load_dword_saddr_6
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_saddr_6
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 4, 3, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 20, 3, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+    ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+...
+
+---
+name: merge_global_load_dwordx2_saddr
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dwordx2_saddr
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s128) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_SADDR]].sub2_sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: no_merge_global_load_dword_and_global_load_dword_saddr
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_and_global_load_dword_saddr
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: no_merge_global_load_dword_saddr_different_saddr
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_saddr_different_saddr
+    ; GCN: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]].sub0_sub1, [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]].sub2_sub3, [[DEF1]], 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]]
+    %0:sgpr_128 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: no_merge_global_load_dword_saddr_different_vaddr
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_saddr_different_vaddr
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]]
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3
+...