diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,12 +79,13 @@ MIMG, TBUFFER_LOAD, TBUFFER_STORE, - GLOBAL_LOAD, GLOBAL_LOAD_SADDR, - GLOBAL_STORE, GLOBAL_STORE_SADDR, FLAT_LOAD, - FLAT_STORE + FLAT_STORE, + GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of + GLOBAL_STORE // any CombineInfo, they are only ever returned by + // getCommonInstClass. }; struct AddressRegs { @@ -275,6 +276,9 @@ static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, const CombineInfo &Paired); + static InstClassEnum getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired); + public: static char ID; @@ -438,7 +442,11 @@ case AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX4: - return GLOBAL_LOAD; + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return FLAT_LOAD; case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: @@ -448,22 +456,16 @@ case AMDGPU::GLOBAL_STORE_DWORDX2: case AMDGPU::GLOBAL_STORE_DWORDX3: case AMDGPU::GLOBAL_STORE_DWORDX4: - return GLOBAL_STORE; - case AMDGPU::GLOBAL_STORE_DWORD_SADDR: - case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: - case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: - case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: - return GLOBAL_STORE_SADDR; - case AMDGPU::FLAT_LOAD_DWORD: - case AMDGPU::FLAT_LOAD_DWORDX2: - case AMDGPU::FLAT_LOAD_DWORDX3: - case AMDGPU::FLAT_LOAD_DWORDX4: - return FLAT_LOAD; case AMDGPU::FLAT_STORE_DWORD: case AMDGPU::FLAT_STORE_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX3: case AMDGPU::FLAT_STORE_DWORDX4: return FLAT_STORE; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case 
AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return GLOBAL_STORE_SADDR; } } @@ -501,7 +503,11 @@ case AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX4: - return AMDGPU::GLOBAL_LOAD_DWORD; + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return AMDGPU::FLAT_LOAD_DWORD; case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: @@ -511,25 +517,37 @@ case AMDGPU::GLOBAL_STORE_DWORDX2: case AMDGPU::GLOBAL_STORE_DWORDX3: case AMDGPU::GLOBAL_STORE_DWORDX4: - return AMDGPU::GLOBAL_STORE_DWORD; - case AMDGPU::GLOBAL_STORE_DWORD_SADDR: - case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: - case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: - case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: - return AMDGPU::GLOBAL_STORE_DWORD_SADDR; - case AMDGPU::FLAT_LOAD_DWORD: - case AMDGPU::FLAT_LOAD_DWORDX2: - case AMDGPU::FLAT_LOAD_DWORDX3: - case AMDGPU::FLAT_LOAD_DWORDX4: - return AMDGPU::FLAT_LOAD_DWORD; case AMDGPU::FLAT_STORE_DWORD: case AMDGPU::FLAT_STORE_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX3: case AMDGPU::FLAT_STORE_DWORDX4: return AMDGPU::FLAT_STORE_DWORD; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return AMDGPU::GLOBAL_STORE_DWORD_SADDR; } } +// GLOBAL loads and stores are initially classified as FLAT. If both combined +// instructions are FLAT GLOBAL, the class becomes GLOBAL_LOAD or GLOBAL_STORE. +// If either or both instructions are non-segment-specific FLAT, the combined +// operation stays FLAT, potentially promoting one of the GLOBAL operations to +// FLAT. For every other instruction class the original class is returned +// unmodified. CI and Paired are asserted to share the same InstClass. 
+InstClassEnum +SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired) { + assert(CI.InstClass == Paired.InstClass); + + if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && + SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) + return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; + + return CI.InstClass; +} + static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { AddressRegs Result; @@ -762,10 +780,15 @@ // A base pointer for the combined operation is the same as the leading // operation's pointer. if (Paired < CI) - MMOa = MMOb; + std::swap(MMOa, MMOb); + + MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); + // If merging FLAT and GLOBAL set address space to FLAT. + if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; MachineFunction *MF = CI.I->getMF(); - return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size); + return MF->getMachineMemOperand(MMOa, PtrInfo, Size); } bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, @@ -1576,7 +1599,7 @@ const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; - switch (CI.InstClass) { + switch (getCommonInstClass(CI, Paired)) { default: assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); // FIXME: Handle d16 correctly diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir @@ -0,0 +1,312 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: merge_flat_global_load_dword_2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_flat_global_load_dword_2 + ; 
GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `float* undef` + 4, align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 4, basealign 4) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: merge_global_flat_load_dword_2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_flat_load_dword_2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `float addrspace(1)* undef`) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, basealign 8, addrspace 1) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4, basealign 8) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: merge_global_flat_load_dword_3 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_flat_load_dword_3 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = FLAT_LOAD_DWORDX3 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `float* undef`, align 16) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX3_]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0 + ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1) + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 16) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: merge_global_flat_load_dword_4 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_flat_load_dword_4 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i32 addrspace(1)* undef` + 4, align 4, basealign 8) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 12, basealign 8, addrspace 1) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 16) + S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4 +... 
+ +--- +name: merge_flat_global_load_dwordx2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_flat_global_load_dwordx2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `double* undef`, align 8) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `double* undef`) + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: merge_flat_global_load_dwordx3 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_flat_global_load_dwordx3 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `float* undef`, align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX4_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`) + %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `<3 x i32> addrspace(1)* undef`, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: merge_global_flat_load_dwordx3 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_flat_load_dwordx3 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i32 addrspace(1)* undef`, align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX4_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + %2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `<3 x i32>* undef`) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: no_merge_flat_global_load_dword_saddr +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_flat_global_load_dword_saddr + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF1]], [[DEF]].sub0, 4, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef` + 4, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:sreg_64_xexec = IMPLICIT_DEF + %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 4) + 
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3, implicit %4 +... + +--- +name: no_merge_global_saddr_flat_load_dword +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_saddr_flat_load_dword + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF1]], [[DEF]].sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i32* undef` + 4, align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:sreg_64_xexec = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4) + %4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 8) + S_NOP 0, implicit %2, implicit %3, implicit %4 +... 
+ +--- +name: merge_flat_global_store_dword_2 +body: | + bb.0.entry: + ; GCN-LABEL: name: merge_flat_global_store_dword_2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1 + ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i32* undef`, align 4) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) +... + +--- +name: merge_global_flat_store_dword_2 +body: | + bb.0.entry: + ; GCN-LABEL: name: merge_global_flat_store_dword_2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1 + ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i32 addrspace(1)* undef`, align 4) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) +... 
+ +--- +name: merge_flat_global_store_dwordx2 +body: | + bb.0.entry: + ; GCN-LABEL: name: merge_flat_global_store_dwordx2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2 + ; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i32* undef`, align 4) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vreg_64_align2 = IMPLICIT_DEF + FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1) +... + +--- +name: merge_flat_global_store_dwordx3 +body: | + bb.0.entry: + ; GCN-LABEL: name: merge_flat_global_store_dwordx3 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2_sub3 + ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `i32* undef`, align 4) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vreg_96_align2 = IMPLICIT_DEF + FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `<3 x i32> addrspace(1)* undef`, addrspace 1) +... 
+ +--- +name: merge_global_flat_store_dwordx2 +body: | + bb.0.entry: + ; GCN-LABEL: name: merge_global_flat_store_dwordx2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub2, [[DEF2]], %subreg.sub0_sub1 + ; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i64* undef`, align 8) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vreg_64_align2 = IMPLICIT_DEF + GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`) +... + +--- +name: merge_global_flat_store_dwordx3 +body: | + bb.0.entry: + ; GCN-LABEL: name: merge_global_flat_store_dwordx3 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub3, [[DEF2]], %subreg.sub0_sub1_sub2 + ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `<3 x i32>* undef`) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vreg_96_align2 = IMPLICIT_DEF + GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `<3 x i32>* undef`) +... 
+ +--- +name: no_merge_flat_global_store_dword_saddr +body: | + bb.0.entry: + ; GCN-LABEL: name: no_merge_flat_global_store_dword_saddr + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], [[DEF2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF]].sub0, [[DEF3]], [[DEF1]], 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:sreg_64_xexec = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) +... + +--- +name: no_merge_global_saddr_flat_store_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: no_merge_global_saddr_flat_store_dword + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF]].sub0, [[DEF2]], [[DEF1]], 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], [[DEF3]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) + %0:vreg_64_align2 = IMPLICIT_DEF + %1:sreg_64_xexec = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`) +...