Index: include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- include/llvm/CodeGen/TargetPassConfig.h +++ include/llvm/CodeGen/TargetPassConfig.h @@ -22,6 +22,7 @@ class PassConfigImpl; class ScheduleDAGInstrs; +class ScheduleDAGMutation; class TargetMachine; struct MachineSchedContext; @@ -251,6 +252,15 @@ return nullptr; } + /// When EnableMacroFusion is true, create a target-defined MacroFusion + /// instance to be run within the default pre-scheduler. + /// + /// Return nullptr for the default MacroFusion; caller owns a non-null result. + virtual ScheduleDAGMutation * + createMacroFusion(ScheduleDAGInstrs *DAG) const { + return nullptr; + } + /// printAndVerify - Add a pass to dump then verify the machine function, if /// those steps are enabled. /// Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -3119,8 +3119,12 @@ if (DAG->TII->enableClusterStores()) DAG->addMutation(make_unique(DAG->TII, DAG->TRI)); } - if (EnableMacroFusion) - DAG->addMutation(make_unique(*DAG->TII, *DAG->TRI)); + if (EnableMacroFusion) { + if (ScheduleDAGMutation *Fusion = C->PassConfig->createMacroFusion(DAG)) + DAG->addMutation(std::unique_ptr(Fusion)); + else + DAG->addMutation(make_unique(*DAG->TII, *DAG->TRI)); + } return DAG; } Index: lib/CodeGen/PostRASchedulerList.cpp =================================================================== --- lib/CodeGen/PostRASchedulerList.cpp +++ lib/CodeGen/PostRASchedulerList.cpp @@ -122,6 +122,9 @@ /// added to the AvailableQueue. std::vector PendingQueue; + /// Record the next node in a scheduled cluster. + SUnit *NextClusterSucc; + /// HazardRec - The hazard recognizer to use. 
ScheduleHazardRecognizer *HazardRec; @@ -208,8 +211,8 @@ const RegisterClassInfo &RCI, TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, SmallVectorImpl &CriticalPathRCs) - : ScheduleDAGInstrs(MF, &MLI), AA(AA), EndIndex(0) { - + : ScheduleDAGInstrs(MF, &MLI), NextClusterSucc(nullptr), AA(AA), EndIndex(0) +{ const InstrItineraryData *InstrItins = MF.getSubtarget().getInstrItineraryData(); HazardRec = @@ -461,6 +464,11 @@ if (SuccEdge->isWeak()) { --SuccSU->WeakPredsLeft; + // Clustered instructions get higher scheduling priority. + // If SuccSU is not blocked by any other predecessor, let the scheduler pick + // SuccSU as the next instruction to schedule. + if (SuccEdge->isCluster() && SuccSU->NumPredsLeft == 0) + NextClusterSucc = SuccSU; return; } #ifndef NDEBUG @@ -550,54 +558,77 @@ // stall or emit a noop, depending on the target. bool CycleHasInsts = false; + assert(!NextClusterSucc && "Incorrect scheduling state."); + // While Available queue is not empty, grab the node with the highest // priority. If it is not ready put it back. Schedule the node. std::vector NotReady; Sequence.reserve(SUnits.size()); while (!AvailableQueue.empty() || !PendingQueue.empty()) { - // Check to see if any of the pending instructions are ready to issue. If - // so, add them to the available queue. 
- unsigned MinDepth = ~0u; - for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) { - if (PendingQueue[i]->getDepth() <= CurCycle) { - AvailableQueue.push(PendingQueue[i]); - PendingQueue[i]->isAvailable = true; - PendingQueue[i] = PendingQueue.back(); - PendingQueue.pop_back(); - --i; --e; - } else if (PendingQueue[i]->getDepth() < MinDepth) - MinDepth = PendingQueue[i]->getDepth(); - } - - DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this)); - SUnit *FoundSUnit = nullptr, *NotPreferredSUnit = nullptr; bool HasNoopHazards = false; - while (!AvailableQueue.empty()) { - SUnit *CurSUnit = AvailableQueue.pop(); - - ScheduleHazardRecognizer::HazardType HT = - HazardRec->getHazardType(CurSUnit, 0/*no stalls*/); - if (HT == ScheduleHazardRecognizer::NoHazard) { - if (HazardRec->ShouldPreferAnother(CurSUnit)) { - if (!NotPreferredSUnit) { - // If this is the first non-preferred node for this cycle, then - // record it and continue searching for a preferred node. If this - // is not the first non-preferred node, then treat it as though - // there had been a hazard. - NotPreferredSUnit = CurSUnit; - continue; + + // Clustered instructions get higher scheduling priority: the processor + // works more efficiently when they are scheduled next to each other, so + // if a clustered instruction is pending, schedule it first. + if (NextClusterSucc) { + // Wait until its depth has been reached and no hazard remains. 
+ if (NextClusterSucc->getDepth() <= CurCycle && + HazardRec->getHazardType(NextClusterSucc, 0/*no stalls*/) == + ScheduleHazardRecognizer::NoHazard) { + for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) + if (PendingQueue[i] == NextClusterSucc) { + PendingQueue.erase(PendingQueue.begin() + i); + break; } - } else { - FoundSUnit = CurSUnit; - break; - } + + FoundSUnit = NextClusterSucc; + NextClusterSucc->isAvailable = true; + NextClusterSucc = nullptr; + } + } else { + // Check to see if any of the pending instructions are ready to issue. If + // so, add them to the available queue. + unsigned MinDepth = ~0u; + for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) { + if (PendingQueue[i]->getDepth() <= CurCycle) { + AvailableQueue.push(PendingQueue[i]); + PendingQueue[i]->isAvailable = true; + PendingQueue[i] = PendingQueue.back(); + PendingQueue.pop_back(); + --i; --e; + } else if (PendingQueue[i]->getDepth() < MinDepth) + MinDepth = PendingQueue[i]->getDepth(); } - // Remember if this is a noop hazard. - HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard; + DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this)); + + while (!AvailableQueue.empty()) { + SUnit *CurSUnit = AvailableQueue.pop(); + + ScheduleHazardRecognizer::HazardType HT = + HazardRec->getHazardType(CurSUnit, 0/*no stalls*/); + if (HT == ScheduleHazardRecognizer::NoHazard) { + if (HazardRec->ShouldPreferAnother(CurSUnit)) { + if (!NotPreferredSUnit) { + // If this is the first non-preferred node for this cycle, then + // record it and continue searching for a preferred node. If this + // is not the first non-preferred node, then treat it as though + // there had been a hazard. + NotPreferredSUnit = CurSUnit; + continue; + } + } else { + FoundSUnit = CurSUnit; + break; + } + } + + // Remember if this is a noop hazard. 
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard; - NotReady.push_back(CurSUnit); + NotReady.push_back(CurSUnit); + } } // If we have a non-preferred node, push it back onto the available list. Index: lib/Target/PowerPC/CMakeLists.txt =================================================================== --- lib/Target/PowerPC/CMakeLists.txt +++ lib/Target/PowerPC/CMakeLists.txt @@ -28,6 +28,7 @@ PPCLoopPreIncPrep.cpp PPCMCInstLower.cpp PPCMachineFunctionInfo.cpp + PPCMacroFusion.cpp PPCMIPeephole.cpp PPCRegisterInfo.cpp PPCQPXLoadSplat.cpp Index: lib/Target/PowerPC/PPC.td =================================================================== --- lib/Target/PowerPC/PPC.td +++ lib/Target/PowerPC/PPC.td @@ -201,7 +201,7 @@ list Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); list Power9SpecificFeatures = - [FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; + [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; list Power9FeatureList = !listconcat(Power8FeatureList, Power9SpecificFeatures); } Index: lib/Target/PowerPC/PPCMacroFusion.h =================================================================== --- /dev/null +++ lib/Target/PowerPC/PPCMacroFusion.h @@ -0,0 +1,31 @@ +//===-- PPCMacroFusion.h - PPC Instruction Fusion Logic --------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
+#define LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+
+namespace llvm {
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+class PPCMacroFusion : public ScheduleDAGMutation {
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+public:
+  PPCMacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
+    : TII(TII), TRI(TRI) {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+  void applyPower9FusionPattern(ScheduleDAGInstrs *DAGInstrs);
+};
+} // end namespace llvm
+
+#endif
Index: lib/Target/PowerPC/PPCMacroFusion.cpp
===================================================================
--- /dev/null
+++ lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -0,0 +1,104 @@
+//===-- PPCMacroFusion.cpp - PPC Instruction Fusion Logic -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMacroFusion.h"
+#include "PPCRegisterInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+using namespace llvm;
+
+/// Returns true if \p MI reads a register written by \p Other.
+static bool hasDataDep(const TargetRegisterInfo &TRI, const MachineInstr *MI,
+                       const MachineInstr *Other) {
+  for (const MachineOperand &MO : MI->uses()) {
+    if (!MO.isReg() || !MO.readsReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Other->modifiesRegister(Reg, &TRI))
+      return true;
+  }
+  return false;
+}
+
+/// Adds a cluster edge from \p First to \p Second. When \p HasDependency is
+/// set, the latency of the existing edge between them is zeroed first.
+/// \p DAGInstrs is currently unused.
+static void
+createClusterEdge(ScheduleDAGInstrs *DAGInstrs, SUnit &First, SUnit &Second,
+                  bool HasDependency) {
+  // Set latency of the edge to zero so (pre) scheduler is able to pick the
+  // Second instruction in the same cycle.
+  if (HasDependency) {
+    for (SDep &I : Second.Preds)
+      if (I.getSUnit() == &First) {
+        I.setLatency(0);
+        break;
+      }
+
+    for (SDep &I : First.Succs)
+      if (I.getSUnit() == &Second) {
+        I.setLatency(0);
+        break;
+      }
+  }
+
+  // FIXME: When adding a new edge, should we check whether it creates a
+  // cycle?
+  const SDep PredDep(&First, SDep::Cluster);
+  Second.addPred(PredDep, /*Required=*/!PredDep.isArtificial());
+}
+
+/// Links every ORIS8 that is immediately followed by a dependent ORI8 with a
+/// cluster edge.
+void PPCMacroFusion::applyPower9FusionPattern(ScheduleDAGInstrs *DAGInstrs) {
+  std::vector<SUnit> &SU = DAGInstrs->SUnits;
+
+  assert(SU.size() > 1 && "Meaningless fusion pattern match.");
+
+  // Power9 Fusion Patterns:
+  // Examine every adjacent pair (SU[Idx], SU[Idx + 1]), starting with the
+  // first node. The loop bound keeps SU[Idx + 1] inside the vector.
+  for (unsigned Idx = 0; Idx + 1 < SU.size(); ++Idx) {
+    const MachineInstr *FirstMI = SU[Idx].getInstr();
+    const MachineInstr *SecondMI = SU[Idx + 1].getInstr();
+
+    switch (FirstMI->getOpcode()) {
+    default:
+      break;
+
+    case PPC::ORIS8:
+      if (SecondMI->getOpcode() == PPC::ORI8 &&
+          hasDataDep(TRI, SecondMI, FirstMI)) {
+        createClusterEdge(DAGInstrs, SU[Idx], SU[Idx + 1], true);
+        // SU[Idx + 1] is now the tail of a fused pair; do not try it as the
+        // head of another one.
+        ++Idx;
+      }
+      break;
+    }
+  }
+}
+
+/// Dispatches to the fusion patterns of the subtarget's CPU directive; does
+/// nothing when there are fewer than two scheduling units.
+void PPCMacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+  std::vector<SUnit> &SU = DAGInstrs->SUnits;
+
+  if (SU.size() < 2) return;
+
+  const PPCSubtarget &Subtarget = DAGInstrs->MF.getSubtarget<PPCSubtarget>();
+
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+
+  case PPC::DIR_PWR9:
+    return applyPower9FusionPattern(DAGInstrs);
+  }
+}
Index: lib/Target/PowerPC/PPCSubtarget.h
===================================================================
--- lib/Target/PowerPC/PPCSubtarget.h
+++ lib/Target/PowerPC/PPCSubtarget.h
@@ -187,6 +187,9 @@
   }
   const PPCTargetMachine &getTargetMachine() const { return TM; }
 
+  void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
+                              &Mutations) const override;
+
   /// initializeSubtargetDependencies - Initializes using a CPU and feature string
   /// so that we can use initializer lists for subtarget initialization.
   PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
Index: lib/Target/PowerPC/PPCSubtarget.cpp
===================================================================
--- lib/Target/PowerPC/PPCSubtarget.cpp
+++ lib/Target/PowerPC/PPCSubtarget.cpp
@@ -13,6 +13,7 @@
 
 #include "PPCSubtarget.h"
 #include "PPC.h"
+#include "PPCMacroFusion.h"
 #include "PPCRegisterInfo.h"
 #include "PPCTargetMachine.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -247,5 +248,12 @@
   return flags | PPCII::MO_NLP_FLAG;
 }
 
+/// Appends the PPC macro-fusion mutation to \p Mutations.
+void PPCSubtarget::getPostRAMutations(
+    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.push_back(make_unique<PPCMacroFusion>(*getInstrInfo(),
+                                                  *getRegisterInfo()));
+}
+
 bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
 bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -13,9 +13,11 @@
 
 #include "PPCTargetMachine.h"
 #include "PPC.h"
+#include "PPCMacroFusion.h"
 #include "PPCTargetObjectFile.h"
 #include "PPCTargetTransformInfo.h"
 #include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
@@ -312,7 +314,9 @@
   void addPreRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
+  ScheduleDAGMutation *createMacroFusion(ScheduleDAGInstrs *DAG) const override;
 };
+
 } // namespace
 
 TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -435,6 +439,13 @@
   addPass(createPPCBranchSelectionPass(), false);
 }
 
+/// Creates the PPC-specific macro-fusion mutation. The object is allocated
+/// with new and returned as a raw pointer that the caller must take over.
+ScheduleDAGMutation *
+PPCPassConfig::createMacroFusion(ScheduleDAGInstrs *DAG) const {
+  return new PPCMacroFusion(*DAG->TII, *DAG->TRI);
+}
+
 TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
   return TargetIRAnalysis([this](const Function &F) {
     return TargetTransformInfo(PPCTTIImpl(this, F));
Index: test/CodeGen/PowerPC/fusing-constant.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/fusing-constant.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=ppc64 -mcpu=pwr9 -verify-machineinstrs | \
+; RUN:   FileCheck %s -check-prefix=CHECK-P9
+; RUN: llc < %s -march=ppc64 -mcpu=pwr8 -verify-machineinstrs | \
+; RUN:   FileCheck %s -check-prefix=CHECK-P8
+define i64 @fusingOrisOri(i64 %x, i64 %y, i64 %z) {
+entry:
+  %add = add nsw i64 %x, 4295163905
+  %add1 = add nsw i64 %y, 8590196738
+  %add2 = add nsw i64 %z, 12885229571
+  %and = and i64 %add1, %add
+  %and3 = and i64 %and, %add2
+  ret i64 %and3
+
+; CHECK-P9-LABEL: @fusingOrisOri
+; CHECK-P9: oris
+; CHECK-P9: ori
+; CHECK-P9: oris
+; CHECK-P9: ori
+; CHECK-P9: oris
+; CHECK-P9: ori
+
+; CHECK-P8-LABEL: @fusingOrisOri
+; CHECK-P8: oris
+; CHECK-P8: oris
+; CHECK-P8: oris
+; CHECK-P8: ori
+; CHECK-P8: ori
+; CHECK-P8: ori
+}