Index: llvm/trunk/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetInstrInfo.h
+++ llvm/trunk/include/llvm/Target/TargetInstrInfo.h
@@ -1097,8 +1097,8 @@
   /// or
   ///   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt,
-                                   MachineInstr &SecondLdSt,
+  virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+                                   MachineInstr &SecondLdSt, unsigned BaseReg2,
                                    unsigned NumLoads) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
Index: llvm/trunk/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineScheduler.cpp
+++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp
@@ -1561,14 +1561,10 @@
   std::sort(MemOpRecords.begin(), MemOpRecords.end());
   unsigned ClusterLength = 1;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
-    if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
-      ClusterLength = 1;
-      continue;
-    }
-
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
-    if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(),
+    if (TII->shouldClusterMemOps(*SUa->getInstr(), MemOpRecords[Idx].BaseReg,
+                                 *SUb->getInstr(), MemOpRecords[Idx+1].BaseReg,
                                  ClusterLength) &&
         DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
@@ -242,7 +242,8 @@
   bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
                     int64_t &MinOffset, int64_t &MaxOffset) const;
 
-  bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+  bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+                           MachineInstr &SecondLdSt, unsigned BaseReg2,
                            unsigned NumLoads) const override;
 
   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2104,8 +2104,13 @@
 ///
 /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+                                           unsigned BaseReg1,
                                            MachineInstr &SecondLdSt,
+                                           unsigned BaseReg2,
                                            unsigned NumLoads) const {
+  if (BaseReg1 != BaseReg2)
+    return false;
+
   // Only cluster up to a single pair.
   if (NumLoads > 1)
     return false;
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -151,7 +151,8 @@
                              int64_t &Offset,
                              const TargetRegisterInfo *TRI) const final;
 
-  bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+  bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+                           MachineInstr &SecondLdSt, unsigned BaseReg2,
                            unsigned NumLoads) const final;
 
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -356,15 +357,52 @@
   return false;
 }
 
+static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
+                                  const MachineInstr &MI2, unsigned BaseReg2) {
+  if (BaseReg1 == BaseReg2)
+    return true;
+
+  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
+    return false;
+
+  auto MO1 = *MI1.memoperands_begin();
+  auto MO2 = *MI2.memoperands_begin();
+  if (MO1->getAddrSpace() != MO2->getAddrSpace())
+    return false;
+
+  auto Base1 = MO1->getValue();
+  auto Base2 = MO2->getValue();
+  if (!Base1 || !Base2)
+    return false;
+  const MachineFunction &MF = *MI1.getParent()->getParent();
+  const DataLayout &DL = MF.getFunction()->getParent()->getDataLayout();
+  Base1 = GetUnderlyingObject(Base1, DL);
+  Base2 = GetUnderlyingObject(Base2, DL);
+
+  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
+    return false;
+
+  return Base1 == Base2;
+}
+
 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+                                      unsigned BaseReg1,
                                       MachineInstr &SecondLdSt,
+                                      unsigned BaseReg2,
                                       unsigned NumLoads) const {
+  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
+    return false;
+
   const MachineOperand *FirstDst = nullptr;
   const MachineOperand *SecondDst = nullptr;
 
   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
+    const unsigned MaxGlobalLoadCluster = 6;
+    if (NumLoads > MaxGlobalLoadCluster)
+      return false;
+
     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
     if (!FirstDst)
       FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
Index: llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
@@ -105,7 +105,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -125,7 +125,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
Index: llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll
@@ -42,8 +42,10 @@
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
+; SI: buffer_load_dword [[VAL1:v[0-9]+]],
+; VI: flat_load_dword [[VAL1:v[0-9]+]],
+; VI: flat_load_dword [[VAL0:v[0-9]+]],
 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -280,8 +282,8 @@
 ; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
 ; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
-; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
 ; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
+; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
Index: llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -60,8 +60,10 @@
 }
 
 ; GCN-LABEL: {{^}}fadd_v2f16:
-; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
Index: llvm/trunk/test/CodeGen/AMDGPU/flat-load-clustering.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat-load-clustering.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/flat-load-clustering.mir
@@ -0,0 +1,77 @@
+# RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: flat_load_clustering
+# GCN:      FLAT_LOAD_DWORD
+# GCN-NEXT: FLAT_LOAD_DWORD
+--- |
+  define amdgpu_kernel void @flat_load_clustering(i32 addrspace(1)* nocapture %arg, i32 addrspace(2)* nocapture readonly %arg1) {
+  bb:
+    %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+    %idxprom = sext i32 %tid to i64
+    %gep1 = getelementptr inbounds i32, i32 addrspace(2)* %arg1, i64 %idxprom
+    %load1 = load i32, i32 addrspace(2)* %gep1, align 4
+    %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %idxprom
+    %gep34 = getelementptr inbounds i32, i32 addrspace(2)* %gep1, i64 4
+    %load2 = load i32, i32 addrspace(2)* %gep34, align 4
+    %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %gep2, i64 4
+    store i32 %load1, i32 addrspace(1)* %gep2, align 4
+    store i32 %load2, i32 addrspace(1)* %gep4, align 4
+    ret void
+  }
+
+  declare i32 @llvm.amdgcn.workitem.id.x()
+
+...
+---
+name:            flat_load_clustering
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: sgpr_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64_xexec }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: vgpr_32 }
+  - { id: 9, class: vreg_64 }
+  - { id: 10, class: vreg_64 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vreg_64 }
+  - { id: 13, class: vreg_64 }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%0' }
+  - { reg: '%sgpr4_sgpr5', virtual-reg: '%1' }
+body:             |
+  bb.0.bb:
+    liveins: %vgpr0, %sgpr4_sgpr5
+
+    %1 = COPY %sgpr4_sgpr5
+    %0 = COPY %vgpr0
+    %3 = S_LOAD_DWORDX2_IMM %1, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %4 = S_LOAD_DWORDX2_IMM %1, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %7 = V_LSHLREV_B32_e32 2, %0, implicit %exec
+    %2 = V_MOV_B32_e32 0, implicit %exec
+    undef %12.sub0 = V_ADD_I32_e32 %4.sub0, %7, implicit-def %vcc, implicit %exec
+    %11 = COPY %4.sub1
+    %12.sub1 = V_ADDC_U32_e32 %11, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+    %5 = FLAT_LOAD_DWORD %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.gep1)
+    undef %9.sub0 = V_ADD_I32_e32 %3.sub0, %7, implicit-def %vcc, implicit %exec
+    %8 = COPY %3.sub1
+    %9.sub1 = V_ADDC_U32_e32 %8, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+    undef %13.sub0 = V_ADD_I32_e32 16, %12.sub0, implicit-def %vcc, implicit %exec
+    %13.sub1 = V_ADDC_U32_e32 %12.sub1, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+    %6 = FLAT_LOAD_DWORD %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.gep34)
+    undef %10.sub0 = V_ADD_I32_e32 16, %9.sub0, implicit-def %vcc, implicit %exec
+    %10.sub1 = V_ADDC_U32_e32 %9.sub1, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+    FLAT_STORE_DWORD %9, %5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.gep2)
+    FLAT_STORE_DWORD %10, %6, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.gep4)
+    S_ENDPGM
+
+...
Index: llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -222,11 +222,11 @@
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
 ; GFX9: global_store_dword
 ; GFX9: s_endpgm
 define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
@@ -107,7 +107,7 @@
 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -127,7 +127,7 @@
 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
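
Note (not part of the patch): with the generic base-register equality check removed from MachineScheduler.cpp, each target's shouldClusterMemOps override now receives both base registers and decides for itself. A minimal sketch of what such an override could look like is shown below; the class name HypotheticalTargetInstrInfo and the pair-only policy are illustrative assumptions, not code from this commit.

  // Illustrative override of the updated hook for a hypothetical target.
  // BaseReg1/BaseReg2 are the base registers the scheduler derived via
  // getMemOpBaseRegImmOfs for the two candidate memory operations.
  bool HypotheticalTargetInstrInfo::shouldClusterMemOps(
      MachineInstr &FirstLdSt, unsigned BaseReg1, MachineInstr &SecondLdSt,
      unsigned BaseReg2, unsigned NumLoads) const {
    // The scheduler no longer filters on equal base registers itself, so a
    // simple target can restore the previous behavior with this check.
    if (BaseReg1 != BaseReg2)
      return false;

    // Hypothetical policy: only start a pair, mirroring the AArch64 limit.
    return NumLoads == 1;
  }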