Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -138,6 +138,7 @@
   ScheduleDAGMILive *DAG =
       new ScheduleDAGMILive(C,
                             llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  DAG->addMutation(AMDGPU::createPhysRegCSEDAGMutation());
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
Index: lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.h
+++ lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -49,6 +49,13 @@
   SUnit *pickNode(bool &IsTopNode) override;
 };
 
+namespace AMDGPU {
+
+/// Creates a DAG mutation that moves the users of repeated, identical
+/// physical register definitions onto the first one and clusters the copies.
+std::unique_ptr<ScheduleDAGMutation> createPhysRegCSEDAGMutation();
+
+} // End namespace AMDGPU
 } // End namespace llvm
 
 #endif // GCNSCHEDSTRATEGY_H
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -311,3 +311,112 @@
   DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
   return SU;
 }
+
+namespace {
+
+/// Returns true if \p MI may be merged with an identical copy of itself:
+/// it writes only physical registers, neither touches memory nor has
+/// unmodeled side effects, and every remaining operand is an immediate.
+static bool isCSECandidate(const MachineInstr &MI) {
+  // Only CSE instructions that define physical registers.
+  for (const MachineOperand &MO : MI.defs()) {
+    if (!TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      return false;
+  }
+
+  // Don't CSE memory instructions.
+  if (MI.hasUnmodeledSideEffects() || MI.mayLoadOrStore())
+    return false;
+
+  // Only CSE instructions with immediate uses. uses() also covers implicit
+  // operands, so any implicit register operand rejects the instruction too.
+  for (const MachineOperand &MO : MI.uses()) {
+    if (!MO.isImm())
+      return false;
+  }
+
+  return true;
+}
+
+/// Appends to \p Worklist every node reachable from \p SU through successor
+/// edges whose instruction is identical to the one in \p SU. The search
+/// continues from each match, so chains of duplicates are collected as well.
+static void collectIdenticalSuccs(SUnit *SU,
+                                  SmallVectorImpl<SUnit *> &Worklist) {
+  for (const SDep &Succ : SU->Succs) {
+    SUnit *SuccSU = Succ.getSUnit();
+
+    // The special entry/exit nodes may carry no instruction at all, so they
+    // must never be dereferenced or treated as duplicates.
+    if (SuccSU->isBoundaryNode())
+      continue;
+
+    if (SU->getInstr()->isIdenticalTo(*SuccSU->getInstr())) {
+      Worklist.push_back(SuccSU);
+      collectIdenticalSuccs(SuccSU, Worklist);
+    }
+  }
+}
+
+/// DAG mutation that moves the users of a duplicated physical register
+/// definition over to the first copy and clusters the copies.
+class PhysRegCSE : public ScheduleDAGMutation {
+public:
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void PhysRegCSE::apply(ScheduleDAGInstrs *DAGInstrs) {
+  for (SUnit &Node : DAGInstrs->SUnits) {
+    SUnit *SU = &Node;
+    if (!isCSECandidate(*SU->getInstr()))
+      continue;
+
+    // Scan successors for identical instructions.
+    SmallVector<SUnit *, 4> Worklist;
+    collectIdenticalSuccs(SU, Worklist);
+
+    for (SUnit *IdenticalSU : Worklist) {
+      // Move the successors over to SU. Iterate over a snapshot because
+      // removePred() erases from IdenticalSU->Succs.
+      SmallVector<SDep, 8> SuccEdges(IdenticalSU->Succs.begin(),
+                                     IdenticalSU->Succs.end());
+      for (const SDep &Edge : SuccEdges) {
+        SUnit *User = Edge.getSUnit();
+
+        // Clone the edge instead of rebuilding it from getKind()/getReg():
+        // that keeps the latency and also works for order dependencies,
+        // which carry no register.
+        SDep OldPred = Edge;
+        OldPred.setSUnit(IdenticalSU);
+        SDep NewPred = Edge;
+        NewPred.setSUnit(SU);
+
+        User->removePred(OldPred);
+        User->addPred(NewPred);
+      }
+
+      // Remove all predecessors. Each edge is copied first so removePred()
+      // is never handed a reference into the list it erases from.
+      while (!IdenticalSU->Preds.empty()) {
+        const SDep Pred = IdenticalSU->Preds.front();
+        IdenticalSU->removePred(Pred);
+      }
+
+      // Try to schedule the two CSEs together so MachineCSE can more easily
+      // remove them.
+      IdenticalSU->addPred(SDep(SU, SDep::Cluster));
+    }
+  }
+}
+
+} // End anonymous namespace
+
+namespace llvm {
+namespace AMDGPU {
+
+std::unique_ptr<ScheduleDAGMutation> createPhysRegCSEDAGMutation() {
+  return make_unique<PhysRegCSE>();
+}
+
+} // End namespace AMDGPU
+} // End namespace llvm
Index: test/CodeGen/AMDGPU/init-m0-sched-deps.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/init-m0-sched-deps.mir
@@ -0,0 +1,201 @@
+# RUN: llc -march=amdgcn -run-pass machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @reinit_m0_neg1(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_multi_value(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_neg1_multi_use_same_def(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1, i32 addrspace(3)* %lds2) {
+    ret void
+  }
+
+  define void @reinit_m0_add_impdef_scc(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_brev(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_diff_opcode(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_copy(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+  define void @reinit_m0_other_use(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+
+...
+---
+
+# CHECK-LABEL: name: reinit_m0_neg1
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+name: reinit_m0_neg1
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_MOV_B32 -1
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+# The two m0 writes use different immediates (400, 800); the expected output keeps both, each before a DS_READ_B32.
+# CHECK-LABEL: name: reinit_m0_multi_value
+# CHECK: %m0 = S_MOV_B32 400
+# CHECK: DS_READ_B32
+# CHECK: %m0 = S_MOV_B32 800
+# CHECK: DS_READ_B32
+name: reinit_m0_multi_value
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 400
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_MOV_B32 800
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+# m0 is rewritten with the same value (-1) after two reads; the three DS_READ_B32 are expected back to back.
+# CHECK-LABEL: name: reinit_m0_neg1_multi_use_same_def
+# CHECK: %m0 = S_MOV_B32 -1
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+
+name: reinit_m0_neg1_multi_use_same_def
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+    %m0 = S_MOV_B32 -1
+    %3 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds2)
+...
+---
+# The repeated m0 write also carries an implicit-def of scc; the expected output keeps both writes, each before a read.
+# CHECK-LABEL: name: reinit_m0_add_impdef_scc
+# CHECK: %m0 = S_ADD_U32 0, 4, implicit-def %scc
+# CHECK: DS_READ_B32
+# CHECK: %m0 = S_ADD_U32 0, 4, implicit-def %scc
+# CHECK: DS_READ_B32
+name: reinit_m0_add_impdef_scc
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_ADD_U32 0, 4, implicit-def %scc
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_ADD_U32 0, 4, implicit-def %scc
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+# The repeated m0 write is S_BREV_B32 2 instead of a plain move; the two DS_READ_B32 are expected back to back.
+# CHECK-LABEL: name: reinit_m0_brev
+# CHECK: %m0 = S_BREV_B32 2
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+name: reinit_m0_brev
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_BREV_B32 2
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_BREV_B32 2
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+# Same immediate (400) but different opcodes (S_MOV_B32, S_BREV_B32); the expected output keeps both writes in order.
+# CHECK-LABEL: name: reinit_m0_diff_opcode
+# CHECK: %m0 = S_MOV_B32 400
+# CHECK: DS_READ_B32
+# CHECK: %m0 = S_BREV_B32 400
+# CHECK: DS_READ_B32
+name: reinit_m0_diff_opcode
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 400
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_BREV_B32 400
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+# m0 is written by a COPY from a virtual register; the expected output keeps both copies, each before a DS_READ_B32.
+# CHECK-LABEL: name: reinit_m0_copy
+# CHECK: %3 = S_MOV_B32 400
+# CHECK: %m0 = COPY %3
+# CHECK: DS_READ_B32
+# CHECK: %m0 = COPY %3
+# CHECK: DS_READ_B32
+name: reinit_m0_copy
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_32_xm0 }
+body: |
+  bb.0:
+    %3 = S_MOV_B32 400
+    %m0 = COPY %3
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = COPY %3
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...
+---
+#FIXME: Why isn't this combined?
+# Here m0 is read by S_ADD_U32 rather than DS_READ_B32; the expected output keeps both S_MOV_B32 8, each before its user.
+# CHECK-LABEL: name: reinit_m0_other_use
+# CHECK: %m0 = S_MOV_B32 8
+# CHECK: S_ADD_U32 %m0, 4
+# CHECK: %m0 = S_MOV_B32 8
+# CHECK: S_ADD_U32 %m0, 8
+name: reinit_m0_other_use
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sreg_32_xm0 }
+  - { id: 3, class: sreg_32_xm0 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 8
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = S_ADD_U32 %m0, 4, implicit-def %scc
+    %m0 = S_MOV_B32 8
+    %2 = S_ADD_U32 %m0, 8, implicit-def %scc
+...