diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -163,6 +163,11 @@
   }
 
   void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
+
+  // Compare by pointer order.
+  bool operator<(const CombineInfo& Other) const {
+    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
+  }
 };
 
 struct BaseRegisters {
@@ -260,6 +265,9 @@
                              MemInfoMap &Visited,
                              SmallPtrSet<MachineInstr *, 4> &AnchorList,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;
 
+  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
+                                                     const CombineInfo &Paired);
+
 public:
   static char ID;
 
@@ -662,19 +670,23 @@
   return true;
 }
 
-// This function assumes that \p A and \p B have are identical except for
-// size and offset, and they reference adjacent memory.
-static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
-                                                   const MachineMemOperand *A,
-                                                   const MachineMemOperand *B) {
-  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
-  unsigned Size = A->getSize() + B->getSize();
-  // This function adds the offset parameter to the existing offset for A,
-  // so we pass 0 here as the offset and then manually set it to the correct
-  // value after the call.
-  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
-  MMO->setOffset(MinOffset);
-  return MMO;
+// Given that \p CI and \p Paired are adjacent memory operations, produce a new
+// MMO for the combined operation with a new access size.
+MachineMemOperand *
+SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
+                                               const CombineInfo &Paired) {
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
+
+  unsigned Size = MMOa->getSize() + MMOb->getSize();
+
+  // A base pointer for the combined operation is the same as the leading
+  // operation's pointer.
+  if (Paired < CI)
+    MMOa = MMOb;
+
+  MachineFunction *MF = CI.I->getMF();
+  return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size);
 }
 
 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
@@ -1157,10 +1169,7 @@
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
-  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   unsigned SubRegIdx0, SubRegIdx1;
   std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
@@ -1199,16 +1208,12 @@
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
   MachineInstr *New =
       BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
-         .addMemOperand(
-             combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1257,9 +1262,6 @@
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
   MachineInstr *New =
       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1267,7 +1269,7 @@
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
-         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1319,9 +1321,6 @@
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
   MachineInstr *New =
       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1330,8 +1329,7 @@
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
-         .addMemOperand(
-             combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1395,9 +1393,6 @@
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
   MachineInstr *New =
       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1406,8 +1401,7 @@
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
-         .addMemOperand(
-             combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1430,14 +1424,11 @@
   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
     MIB.add(*SAddr);
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
   MachineInstr *New =
       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
-         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1520,15 +1511,9 @@
 std::pair<unsigned, unsigned>
 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                     const CombineInfo &Paired) {
-  bool ReverseOrder;
-  if (CI.InstClass == MIMG) {
-    assert(
-        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
-        "No overlaps");
-    ReverseOrder = CI.DMask > Paired.DMask;
-  } else {
-    ReverseOrder = CI.Offset > Paired.Offset;
-  }
+  assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
+                                   CI.Width + Paired.Width)) &&
+         "No overlaps");
 
   unsigned Idx0;
   unsigned Idx1;
@@ -1544,7 +1529,7 @@
   assert(CI.Width >= 1 && CI.Width <= 4);
   assert(Paired.Width >= 1 && Paired.Width <= 4);
 
-  if (ReverseOrder) {
+  if (Paired < CI) {
     Idx1 = Idxs[0][Paired.Width - 1];
     Idx0 = Idxs[Paired.Width][CI.Width - 1];
   } else {
@@ -1618,9 +1603,6 @@
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
-  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
-  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
-
   MachineInstr *New =
       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -1628,7 +1610,7 @@
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
-         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@@ -405,7 +405,7 @@
 
     ; GCN-LABEL: name: merge_global_load_dword_2_out_of_order
    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `float addrspace(1)* undef`, align 4, addrspace 1)
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub0
     ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
@@ -422,7 +422,7 @@
 
    ; GCN-LABEL: name: merge_global_load_dword_3_out_of_order
    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 0, 0, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 0, 0, implicit $exec :: (load (s96) from `float addrspace(1)* undef`, align 16, addrspace 1)
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX3_]].sub0_sub1
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub2
     ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir
@@ -19,7 +19,7 @@
 
 ...
 ---
 # GFX10-LABEL: name: image_load_merged_v1v3_reversed
-# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx10 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx10 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2
 
@@ -95,7 +95,7 @@
 
 ---
 # GFX10-LABEL: name: image_load_merged_v3v1_reversed
-# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx10 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx10 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0
 
@@ -114,7 +114,7 @@
 
 ---
 # GFX10-LABEL: name: image_load_divided_merged
-# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx10 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx10 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 
 name: image_load_divided_merged
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
@@ -20,7 +20,7 @@
 
 ---
 # GFX9-LABEL: name: image_load_merged_v1v3_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2
 
@@ -96,7 +96,7 @@
 
 ---
 # GFX9-LABEL: name: image_load_merged_v3v1_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0
 
@@ -115,7 +115,7 @@
 
 ---
 # GFX9-LABEL: name: image_load_divided_merged
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 
 name: image_load_divided_merged
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir
@@ -19,7 +19,7 @@
 
 ...
 ---
 # GFX10-LABEL: name: image_sample_l_merged_v1v3_reversed
-# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx10 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx10 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2
 
@@ -95,7 +95,7 @@
 
 ---
 # GFX10-LABEL: name: image_sample_l_merged_v3v1_reversed
-# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx10 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx10 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
 # GFX10: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3
 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0
 
@@ -114,7 +114,7 @@
 
 ---
 # GFX10-LABEL: name: image_sample_l_divided_merged
-# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx10 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX10: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx10 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 
 name: image_sample_l_divided_merged
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
@@ -20,7 +20,7 @@
 
 ---
 # GFX9-LABEL: name: image_sample_l_merged_v1v3_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2
 
@@ -96,7 +96,7 @@
 
 ---
 # GFX9-LABEL: name: image_sample_l_merged_v3v1_reversed
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0
 
@@ -115,7 +115,7 @@
 
 ---
 # GFX9-LABEL: name: image_sample_l_divided_merged
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4)
 
 name: image_sample_l_divided_merged
 body: |
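
Reviewer note, not part of the patch: a minimal standalone sketch of the merge logic above, using plain structs in place of LLVM's CombineInfo and MachineMemOperand; the MemOp and combineAdjacent names are illustrative only. It shows why the combined MMO now inherits the leading (lower-address) operation's pointer info, which is also why the test checks flip between the `i32` and `float` pointer strings and gain or lose their `align` annotations: the printed pointer and alignment now come from whichever original access leads in memory.

// sketch.cpp -- hypothetical illustration, compiles standalone
#include <cassert>
#include <cstdint>
#include <iostream>

struct MemOp {
  int64_t Offset;  // byte offset of the access
  uint64_t Size;   // access size in bytes
  // Mirrors the patch's CombineInfo::operator<: the "leading" operation is
  // the one touching the lower address (lower dmask for MIMG).
  bool operator<(const MemOp &Other) const { return Offset < Other.Offset; }
};

// Mirrors combineKnownAdjacentMMOs: the combined operation keeps the leading
// operation's base (here, its offset) and covers the sum of both sizes. The
// caller guarantees the two accesses are adjacent.
MemOp combineAdjacent(MemOp A, MemOp B) {
  assert(A.Offset + int64_t(A.Size) == B.Offset ||
         B.Offset + int64_t(B.Size) == A.Offset);
  MemOp Lead = (B < A) ? B : A; // leading op, regardless of argument order
  return {Lead.Offset, A.Size + B.Size};
}

int main() {
  // Two adjacent dword loads given in reverse program order, as in the
  // merge_global_load_dword_2_out_of_order test.
  MemOp Hi{4, 4}, Lo{0, 4};
  MemOp Merged = combineAdjacent(Hi, Lo);
  std::cout << "offset=" << Merged.Offset << " size=" << Merged.Size << "\n";
  // offset=0 size=8 -- the MMO starts at the lower access, like the merged
  // GLOBAL_LOAD_DWORDX2.
}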
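
A second hedged sketch of the getSubRegIdxs change, with a hypothetical subRegStarts helper and lane starts standing in for the full Idxs table lookup: the removed ReverseOrder flag collapses into the shared operator<, so the leading operation always occupies the low subregister lanes of the merged register.

// subreg_sketch.cpp -- hypothetical illustration, compiles standalone
#include <cstdio>

struct Op {
  unsigned Offset; // memory-order key, as in CombineInfo::operator<
  unsigned Width;  // number of dwords
  bool operator<(const Op &O) const { return Offset < O.Offset; }
};

// Returns the starting lane of each op inside the merged register, mirroring
// the branch in getSubRegIdxs that picks entries from the Idxs table.
void subRegStarts(const Op &CI, const Op &Paired, unsigned &Idx0,
                  unsigned &Idx1) {
  if (Paired < CI) {     // was: ReverseOrder == true
    Idx1 = 0;            // Paired's lanes come first: sub0...
    Idx0 = Paired.Width; // CI's lanes start after them
  } else {
    Idx0 = 0;
    Idx1 = CI.Width;
  }
}

int main() {
  Op CI{8, 1}, Paired{0, 3}; // a v1 load at offset 8 paired with a v3 at 0
  unsigned Idx0, Idx1;
  subRegStarts(CI, Paired, Idx0, Idx1);
  // CI lands in sub3, Paired in sub0_sub1_sub2 -- the v1v3_reversed pattern.
  std::printf("CI starts at lane %u, Paired at lane %u\n", Idx0, Idx1);
}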