Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -137,6 +137,7 @@
   ScheduleDAGMILive *DAG =
     new ScheduleDAGMILive(C, llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  DAG->addMutation(AMDGPU::createPhysRegCSEDAGMutation());
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
Index: lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.h
+++ lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -49,6 +49,11 @@
   SUnit *pickNode(bool &IsTopNode) override;
 };
 
+namespace AMDGPU {
+
+std::unique_ptr<ScheduleDAGMutation> createPhysRegCSEDAGMutation();
+
+}
 } // End namespace llvm
 
 #endif // GCNSCHEDSTRATEGY_H
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -310,3 +310,97 @@
   DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
   return SU;
 }
+
+namespace {
+
+// Returns true if MI is a rematerializable constant-like instruction that is
+// safe to duplicate/CSE at the scheduling-DAG level: all uses are immediates,
+// all defs are physical registers, and it has no memory/side effects.
+static bool isCSECandidate(const MachineInstr &MI) {
+
+  // Only CSE instructions with immediate uses.
+  for (const MachineOperand &MO : MI.uses()) {
+    if (!MO.isImm())
+      return false;
+  }
+
+  // Only CSE instructions that define physical registers.
+  for (const MachineOperand &MO : MI.defs()) {
+    if (!TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      return false;
+  }
+
+  // Don't CSE memory instructions.
+  if (MI.hasUnmodeledSideEffects() || MI.mayLoadOrStore())
+    return false;
+
+  return true;
+}
+
+// Collects every successor of SU whose MachineInstr is identical to SU's.
+static void collectIdenticalSuccs(SUnit *SU,
+                                  SmallVectorImpl<SUnit *> &Worklist) {
+  for (SUnit::const_succ_iterator
+         SI = SU->Succs.begin(), SE = SU->Succs.end(); SI != SE; ++SI) {
+    SUnit *Succ = SI->getSUnit();
+    if (SU->getInstr()->isIdenticalTo(*Succ->getInstr())) {
+      Worklist.push_back(Succ);
+    }
+  }
+}
+
+// DAG mutation that redirects dependencies of duplicated physreg-defining
+// instructions onto a single instance so MachineCSE can remove the copies.
+class PhysRegCSE : public ScheduleDAGMutation {
+public:
+  PhysRegCSE() { }
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void PhysRegCSE::apply(ScheduleDAGInstrs *DAGInstrs) {
+
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+
+  std::vector<SUnit>::iterator I, Next;
+  for (I = DAG->SUnits.begin(); I != DAG->SUnits.end(); I = Next) {
+    Next = std::next(I);
+    SUnit *SU = &*I;
+    MachineInstr *MI = SU->getInstr();
+
+    if (!isCSECandidate(*MI))
+      continue;
+
+    // Scan successors for identical instructions.
+    SmallVector<SUnit *, 4> Worklist;
+    collectIdenticalSuccs(SU, Worklist);
+    if (Worklist.empty())
+      continue;
+
+    for (SUnit *IdenticalSU : Worklist) {
+      // Copy successors over to SU.
+      for (SUnit::succ_iterator
+             SI = IdenticalSU->Succs.begin(), SE = IdenticalSU->Succs.end();
+           SI != SE; ++SI) {
+        SI->getSUnit()->addPred(SDep(SU, SI->getKind(), SI->getReg()));
+        SI->getSUnit()->removePred(SDep(IdenticalSU, SI->getKind(),
+                                        SI->getReg()));
+      }
+
+      // Remove all predecessors.
+      for (SUnit::pred_iterator
+             PI = IdenticalSU->Preds.begin(), PE = IdenticalSU->Preds.end();
+           PI != PE; ++PI) {
+        IdenticalSU->removePred(*PI);
+      }
+
+      // Try to schedule the two CSEs together so MachineCSE can more easily
+      // remove them.
+      IdenticalSU->addPred(SDep(SU, SDep::Cluster));
+    }
+  }
+}
+
+} // End anonymous namespace
+
+namespace llvm {
+namespace AMDGPU {
+
+std::unique_ptr<ScheduleDAGMutation> createPhysRegCSEDAGMutation() {
+  return make_unique<PhysRegCSE>();
+}
+
+}
+}
Index: test/CodeGen/AMDGPU/init-m0-sched-deps.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/init-m0-sched-deps.mir
@@ -0,0 +1,26 @@
+# RUN: llc -march=amdgcn -run-pass machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @main(i32 addrspace(3)* %lds0, i32 addrspace(3)* %lds1) {
+    ret void
+  }
+...
+
+---
+
+# CHECK-LABEL: name: main
+# CHECK: DS_READ_B32
+# CHECK-NEXT: DS_READ_B32
+name: main
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %m0 = S_MOV_B32 -1
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = DS_READ_B32 %0, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds0)
+    %m0 = S_MOV_B32 -1
+    %2 = DS_READ_B32 %0, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.lds1)
+...