diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -79,6 +79,7 @@
   MIMG,
   TBUFFER_LOAD,
   TBUFFER_STORE,
+  GLOBAL_LOAD
 };
 
 struct AddressRegs {
@@ -233,6 +234,9 @@
   MachineBasicBlock::iterator
   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                         MachineBasicBlock::iterator InsertBefore);
+  MachineBasicBlock::iterator
+  mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
+                      MachineBasicBlock::iterator InsertBefore);
 
   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                            int32_t NewOffset) const;
@@ -300,10 +304,15 @@
   switch (Opc) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+  case AMDGPU::GLOBAL_LOAD_DWORD:
     return 1;
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2:
     return 2;
+  case AMDGPU::GLOBAL_LOAD_DWORDX3:
+    return 3;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4:
     return 4;
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return 8;
@@ -388,6 +397,11 @@
   case AMDGPU::DS_WRITE_B64:
   case AMDGPU::DS_WRITE_B64_gfx9:
     return DS_WRITE;
+  case AMDGPU::GLOBAL_LOAD_DWORD:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4:
+    return GLOBAL_LOAD;
   }
 }
@@ -421,6 +435,11 @@
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
+  case AMDGPU::GLOBAL_LOAD_DWORD:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4:
+    return AMDGPU::GLOBAL_LOAD_DWORD;
   }
 }
@@ -483,6 +502,12 @@
   case AMDGPU::DS_WRITE_B64_gfx9:
     Result.Addr = true;
     return Result;
+  case AMDGPU::GLOBAL_LOAD_DWORD:
+  case AMDGPU::GLOBAL_LOAD_DWORDX2:
+  case AMDGPU::GLOBAL_LOAD_DWORDX3:
+  case AMDGPU::GLOBAL_LOAD_DWORDX4:
+    Result.VAddr = true;
+    return Result;
   }
 }
@@ -1364,6 +1389,49 @@
   return New;
 }
 
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
+    CombineInfo &CI, CombineInfo &Paired,
+    MachineBasicBlock::iterator InsertBefore) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  const unsigned Opcode = getNewOpcode(CI, Paired);
+
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+  Register DestReg = MRI->createVirtualRegister(SuperRC);
+
+  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
+
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
+
+  // Build the wide load at the lower of the two offsets, reusing the vaddr
+  // and cache policy of the first instruction and fusing both memory
+  // operands.
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+          .addImm(std::min(CI.Offset, Paired.Offset))
+          .addImm(CI.CPol)
+          .addMemOperand(
+              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+  // Copy to the old destination registers.
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
+  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
+
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest0) // Copy to same destination including flags and sub reg.
+      .addReg(DestReg, 0, SubRegIdx0);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  CI.I->eraseFromParent();
+  Paired.I->eraseFromParent();
+  return New;
+}
+
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
   const unsigned Width = CI.Width + Paired.Width;
@@ -1392,6 +1460,17 @@
     case 8:
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
     }
+  case GLOBAL_LOAD:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::GLOBAL_LOAD_DWORDX2;
+    case 3:
+      return AMDGPU::GLOBAL_LOAD_DWORDX3;
+    case 4:
+      return AMDGPU::GLOBAL_LOAD_DWORDX4;
+    }
   case MIMG:
     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
            "No overlaps");
@@ -2035,6 +2114,10 @@
       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
       OptimizeListAgain |= CI.Width + Paired.Width < 4;
       break;
+    case GLOBAL_LOAD:
+      NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
+      OptimizeListAgain |= CI.Width + Paired.Width < 4;
+      break;
     }
     CI.setMI(NewMI, *this);
     CI.Order = Where->Order;
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -7,13 +7,11 @@
 ; MUBUF-LABEL: memcpy_fixed_align:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    global_load_dword v0, v[1:2], off offset:36
-; MUBUF-NEXT:    global_load_dword v11, v[1:2], off offset:32
+; MUBUF-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
 ; MUBUF-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
 ; MUBUF-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
-; MUBUF-NEXT:    s_waitcnt vmcnt(3)
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:36
-; MUBUF-NEXT:    s_waitcnt vmcnt(3)
+; MUBUF-NEXT:    s_waitcnt vmcnt(2)
+; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:36
 ; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32
 ; MUBUF-NEXT:    s_waitcnt vmcnt(3)
 ; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:28
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@@ -0,0 +1,230 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: merge_global_load_dword_2
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
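+
+# The pass re-runs on merged results narrower than four dwords, so the tests
+# below build x3 and x4 loads in stages (x1+x1 -> x2, then x2+x1 -> x3, and
+# so on). Nonzero cpol operands (1 = glc, 2 = slc, 3 = glc|slc) must be
+# carried over to the merged load unchanged.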
+
+---
+name: merge_global_load_dword_3
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 0, 1, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX3_]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub2
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2, implicit %3
+...
+
+---
+name: merge_global_load_dword_4
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_4
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 2, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
+...
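+
+# Merging is capped at a dwordx4; with five adjacent dwords the fifth load
+# is left unmerged.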
+
+---
+name: merge_global_load_dword_5
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_5
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 3, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[GLOBAL_LOAD_DWORD]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
+...
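+
+# Six adjacent dwords merge into a dwordx4 followed by a dwordx2.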
+
+---
+name: merge_global_load_dword_6
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dword_6
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 16, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6
+...
+
+---
+name: merge_global_load_dwordx2
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec :: (load (s128) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
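+
+# Pairs of differing widths merge as well: an x3 followed by an x1 forms an
+# x4, and an x1 followed by an x2 forms an x3.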
+
+---
+name: merge_global_load_dwordx3_with_dwordx1
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dwordx3_with_dwordx1
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 12, 0, implicit $exec :: (load (s128) from `i128 addrspace(1)* undef`, align 8, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `i128 addrspace(1)* undef`, align 8, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: merge_global_load_dwordx1_with_dwordx2
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_load_dwordx1_with_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 12, 0, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub1_sub2
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 8, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: no_merge_global_load_dword_agpr_with_vgpr
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_agpr_with_vgpr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
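+
+# Remaining negative tests: loads must stay separate when they are not
+# adjacent, when they overlap, or when their cache policies differ.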
+
+---
+name: no_merge_global_load_dword_disjoint
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_disjoint
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: no_merge_global_load_dword_overlap
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_overlap
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: no_merge_global_load_dword_different_cpol
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_different_cpol
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -87,14 +87,7 @@
 ; GCN-LABEL: flat_ptr_arg:
 ; GCN-COUNT-2: global_load_dwordx2
-
-; FIXME: First load is in the constant address space and second is in global
-; because it is clobbered by store. GPU load store vectorizer cannot
-; combine them. Note, this does not happen with -O3 because loads are
-; vectorized in pairs earlier and stay in the global address space.
-
-; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}}
-; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4
+; GCN: global_load_dwordx4
 ; GCN: global_store_dword
 define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) {
 ; CHECK-LABEL: @flat_ptr_arg(
@@ -177,8 +170,7 @@
 ; GCN-LABEL: global_ptr_arg:
 ; GCN: global_load_dwordx2
-; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}}
-; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4
+; GCN: global_load_dwordx4
 ; GCN: global_store_dword
 define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
 ; CHECK-LABEL: @global_ptr_arg(