Index: include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- include/llvm/CodeGen/TargetPassConfig.h +++ include/llvm/CodeGen/TargetPassConfig.h @@ -22,6 +22,7 @@ class PassConfigImpl; class ScheduleDAGInstrs; +class ScheduleDAGMutation; class TargetMachine; struct MachineSchedContext; @@ -251,6 +252,15 @@ return nullptr; } + /// When EnableMacroFusion is true, create a target-defined MacroFusion + /// mutation to run within the default pre-scheduler. Caller takes ownership. + /// + /// Return nullptr to select the default MacroFusion. + virtual ScheduleDAGMutation * + createMacroFusion(ScheduleDAGInstrs *DAG) const { + return nullptr; + } + /// printAndVerify - Add a pass to dump then verify the machine function, if /// those steps are enabled. /// Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -3119,8 +3119,12 @@ if (DAG->TII->enableClusterStores()) DAG->addMutation(make_unique(DAG->TII, DAG->TRI)); } - if (EnableMacroFusion) - DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI)); + if (EnableMacroFusion) { + if (ScheduleDAGMutation *Fusion = C->PassConfig->createMacroFusion(DAG)) + DAG->addMutation(std::unique_ptr<ScheduleDAGMutation>(Fusion)); + else + DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI)); + } return DAG; } Index: lib/CodeGen/PostRASchedulerList.cpp =================================================================== --- lib/CodeGen/PostRASchedulerList.cpp +++ lib/CodeGen/PostRASchedulerList.cpp @@ -122,6 +122,9 @@ /// added to the AvailableQueue. std::vector PendingQueue; + /// Record the next node in a scheduled cluster. + SUnit *NextClusterSucc; + /// HazardRec - The hazard recognizer to use.
ScheduleHazardRecognizer *HazardRec; @@ -208,8 +211,8 @@ const RegisterClassInfo &RCI, TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, SmallVectorImpl &CriticalPathRCs) - : ScheduleDAGInstrs(MF, &MLI), AA(AA), EndIndex(0) { - + : ScheduleDAGInstrs(MF, &MLI), NextClusterSucc(nullptr), AA(AA), EndIndex(0) +{ const InstrItineraryData *InstrItins = MF.getSubtarget().getInstrItineraryData(); HazardRec = @@ -461,6 +464,11 @@ if (SuccEdge->isWeak()) { --SuccSU->WeakPredsLeft; + // Clustered instructions get higher scheduling priority. + // If no other predecessor blocks SuccSU, let the scheduler pick it as the + // next instruction to schedule. + if (SuccEdge->isCluster() && SuccSU->NumPredsLeft == 0) + NextClusterSucc = SuccSU; return; } #ifndef NDEBUG @@ -550,54 +558,77 @@ // stall or emit a noop, depending on the target. bool CycleHasInsts = false; + assert(!NextClusterSucc && "Incorrect scheduling state."); + // While Available queue is not empty, grab the node with the highest // priority. If it is not ready put it back. Schedule the node. std::vector NotReady; Sequence.reserve(SUnits.size()); while (!AvailableQueue.empty() || !PendingQueue.empty()) { - // Check to see if any of the pending instructions are ready to issue. If - // so, add them to the available queue.
- unsigned MinDepth = ~0u; - for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) { - if (PendingQueue[i]->getDepth() <= CurCycle) { - AvailableQueue.push(PendingQueue[i]); - PendingQueue[i]->isAvailable = true; - PendingQueue[i] = PendingQueue.back(); - PendingQueue.pop_back(); - --i; --e; - } else if (PendingQueue[i]->getDepth() < MinDepth) - MinDepth = PendingQueue[i]->getDepth(); - } - - DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this)); - SUnit *FoundSUnit = nullptr, *NotPreferredSUnit = nullptr; bool HasNoopHazards = false; - while (!AvailableQueue.empty()) { - SUnit *CurSUnit = AvailableQueue.pop(); - - ScheduleHazardRecognizer::HazardType HT = - HazardRec->getHazardType(CurSUnit, 0/*no stalls*/); - if (HT == ScheduleHazardRecognizer::NoHazard) { - if (HazardRec->ShouldPreferAnother(CurSUnit)) { - if (!NotPreferredSUnit) { - // If this is the first non-preferred node for this cycle, then - // record it and continue searching for a preferred node. If this - // is not the first non-preferred node, then treat it as though - // there had been a hazard. - NotPreferredSUnit = CurSUnit; - continue; + + // Clustered instructions run more efficiently when scheduled back to back, + // so a recorded cluster successor is scheduled ahead of the queues. + // NOTE(review): confirm the node can never also be in AvailableQueue. + if (NextClusterSucc) { + // Wait until its depth has been reached and no hazard remains.
+ if (NextClusterSucc->getDepth() <= CurCycle && + HazardRec->getHazardType(NextClusterSucc, 0/*no stalls*/) == + ScheduleHazardRecognizer::NoHazard) { + for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) + if (PendingQueue[i] == NextClusterSucc) { + PendingQueue.erase(PendingQueue.begin() + i); + break; } - } else { - FoundSUnit = CurSUnit; - break; - } + + FoundSUnit = NextClusterSucc; + NextClusterSucc->isAvailable = true; + NextClusterSucc = nullptr; + } + } else { + // Check to see if any of the pending instructions are ready to issue. If + // so, add them to the available queue. + unsigned MinDepth = ~0u; + for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) { + if (PendingQueue[i]->getDepth() <= CurCycle) { + AvailableQueue.push(PendingQueue[i]); + PendingQueue[i]->isAvailable = true; + PendingQueue[i] = PendingQueue.back(); + PendingQueue.pop_back(); + --i; --e; + } else if (PendingQueue[i]->getDepth() < MinDepth) + MinDepth = PendingQueue[i]->getDepth(); } - // Remember if this is a noop hazard. - HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard; + DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this)); + + while (!AvailableQueue.empty()) { + SUnit *CurSUnit = AvailableQueue.pop(); + + ScheduleHazardRecognizer::HazardType HT = + HazardRec->getHazardType(CurSUnit, 0/*no stalls*/); + if (HT == ScheduleHazardRecognizer::NoHazard) { + if (HazardRec->ShouldPreferAnother(CurSUnit)) { + if (!NotPreferredSUnit) { + // If this is the first non-preferred node for this cycle, then + // record it and continue searching for a preferred node. If this + // is not the first non-preferred node, then treat it as though + // there had been a hazard. + NotPreferredSUnit = CurSUnit; + continue; + } + } else { + FoundSUnit = CurSUnit; + break; + } + } + + // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard; - NotReady.push_back(CurSUnit); + NotReady.push_back(CurSUnit); + } } // If we have a non-preferred node, push it back onto the available list. Index: lib/Target/PowerPC/CMakeLists.txt =================================================================== --- lib/Target/PowerPC/CMakeLists.txt +++ lib/Target/PowerPC/CMakeLists.txt @@ -28,6 +28,7 @@ PPCLoopPreIncPrep.cpp PPCMCInstLower.cpp PPCMachineFunctionInfo.cpp + PPCMacroFusion.cpp PPCMIPeephole.cpp PPCRegisterInfo.cpp PPCQPXLoadSplat.cpp Index: lib/Target/PowerPC/PPC.td =================================================================== --- lib/Target/PowerPC/PPC.td +++ lib/Target/PowerPC/PPC.td @@ -201,7 +201,7 @@ list Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); list Power9SpecificFeatures = - [FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; + [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0]; list Power9FeatureList = !listconcat(Power8FeatureList, Power9SpecificFeatures); } Index: lib/Target/PowerPC/PPCMacroFusion.h =================================================================== --- /dev/null +++ lib/Target/PowerPC/PPCMacroFusion.h @@ -0,0 +1,31 @@ +//===-- PPCMacroFusion.h - PPC Instruction Fusion Logic --------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H +#define LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" + +namespace llvm { +class TargetInstrInfo; +class TargetRegisterInfo; + +// PPCMacroFusion - PPC-specific macro-fusion implementation.
+class PPCMacroFusion : public ScheduleDAGMutation { + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; +public: + PPCMacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) + : TII(TII), TRI(TRI) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; +} // end namespace llvm + +#endif Index: lib/Target/PowerPC/PPCMacroFusion.cpp =================================================================== --- /dev/null +++ lib/Target/PowerPC/PPCMacroFusion.cpp @@ -0,0 +1,203 @@ +//===-- PPCMacroFusion.cpp - PPC Instruction Fusion Logic -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// How to add a new fusion pair to an existing table? +// 1. Insert the pair into P*FusedInstrsTable; keep rows with equal Op1 adjacent +// 2. If a matched pair requires a special check, add a callback to the table +// for that pair +// +// How to add a new table? +// 1. Define a new P*FusedInstrsTable +// 2. Instantiate a PPCFusionTable from the newly defined table +// +//===----------------------------------------------------------------------===// + +#include "PPCMacroFusion.h" +#include "PPCRegisterInfo.h" +#include "PPCSubtarget.h" +#include "llvm/CodeGen/MachineScheduler.h" +using namespace llvm; + +// CheckCallback - Some fused pairs require a more specific check; implement it +// as a callback and attach it to the pair's FusedInstrsTable entry. +typedef bool (*CheckCallback)(const MachineInstr &First, + const MachineInstr &Second); + +// FusionInfo - Describes one fused pair: first opcode, second opcode, whether +// a data dependency is required, and an optional further check.
+struct FusionInfo { + unsigned Op1; + unsigned Op2; + bool NeedCheckDataDep; + CheckCallback FurtherCheck; +}; + +// Power9's fused-instruction table; rows with the same Op1 must be adjacent. +const static FusionInfo P9FusedInstrsTable[] = { + {PPC::ORIS8, PPC::ORI8, true, nullptr}, + {PPC::ORIS, PPC::ORI, true, nullptr} +}; + +// Returns true if Second reads a register that First modifies. +static bool hasDataDep(const TargetRegisterInfo &TRI, + const MachineInstr &First, const MachineInstr &Second) { + for (const MachineOperand &MO : Second.uses()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + + unsigned Reg = MO.getReg(); + if (First.modifiesRegister(Reg, &TRI)) + return true; + } + return false; +} + +// Add a weak cluster edge First->Second so that the MI scheduler can schedule +// First and Second adjacently. +static void addClusterEdge(SUnit &First, SUnit &Second) { + // If a First->Second edge already exists, set its latency to zero so that + // the (pre) scheduler is able to pick the Second instruction in the + // same cycle. + for (SDep &I: Second.Preds) + if (I.getSUnit() == &First) { + I.setLatency(0); + break; + } + + for (SDep &I: First.Succs) + if (I.getSUnit() == &Second) { + I.setLatency(0); + break; + } + + // FIXME: When adding the new edge, should we check whether it creates + // a cycle? + const SDep PredDep(&First, SDep::Cluster); + Second.addPred(PredDep, /*Required=*/!PredDep.isArtificial()); +} + +// PPCFusionTable - Manages a FusedInstrsTable and decides whether an +// instruction pair is fusable. +class PPCFusionTable { +public: + // TableIndex - Row range [StartRow, StartRow + Size) in FusedInstrsTable.
+ struct TableIndex { + unsigned short StartRow; + unsigned short Size; + + TableIndex(): StartRow(0), Size(0) {} + TableIndex(unsigned short StartRow, unsigned short Size) + : StartRow(StartRow), Size(Size) {} + }; + + PPCFusionTable(const FusionInfo *FIArray, unsigned TableSize) + : FusedInstrsTable(FIArray), TableSize(TableSize) { + initTableIndex(FIArray, TableSize); + } + + void initTableIndex(const FusionInfo *FIArray, unsigned TableSize); + + // areInstrsFusable - Test if First and Second form a legal fused pattern + bool areInstrsFusable(const TargetRegisterInfo &TRI, + const MachineInstr &First, + const MachineInstr &Second) const; +private: + // TableIndexMap - Maps Op1 to its TableIndex, which locates all of Op1's + // pairs in the FusedInstrsTable. + SmallDenseMap<unsigned, TableIndex> TableIndexMap; + + // Externally defined fused instruction pairs. + const FusionInfo *FusedInstrsTable; + const unsigned TableSize; +}; + +void PPCFusionTable::initTableIndex(const FusionInfo *FIArray, + unsigned TableSize) { + assert(TableSize > 0 && "Fusion table must not be empty."); + TableIndex Index(0, 1 /* Size */); + unsigned CurOp1 = FIArray[0].Op1; + for (unsigned i = 1; i < TableSize; ++i) { + if (CurOp1 == FIArray[i].Op1) + ++Index.Size; + else { + assert(TableIndexMap.find(CurOp1) == TableIndexMap.end() && + "The opcode has been added, opcode order in the table is wrong."); + + TableIndexMap[CurOp1] = Index; + + Index.StartRow += Index.Size; + Index.Size = 1; + CurOp1 = FIArray[i].Op1; + } + } + + // Add the last record.
+ assert(TableIndexMap.find(CurOp1) == TableIndexMap.end() && + "The opcode has been added, opcode order in the table is wrong."); + TableIndexMap[CurOp1] = Index; +} + +bool PPCFusionTable::areInstrsFusable(const TargetRegisterInfo &TRI, + const MachineInstr &First, + const MachineInstr &Second) const { + auto Iter = TableIndexMap.find(First.getOpcode()); + if (Iter == TableIndexMap.end()) + return false; + // Only Op1's rows, [StartRow, StartRow + Size), are candidates. + const TableIndex &Idx = Iter->second; + for (unsigned i = Idx.StartRow, e = i + Idx.Size; i != e; ++i) + if (FusedInstrsTable[i].Op2 == Second.getOpcode()) { + if (FusedInstrsTable[i].NeedCheckDataDep && + !hasDataDep(TRI, First, Second)) + return false; + + if (FusedInstrsTable[i].FurtherCheck && + !FusedInstrsTable[i].FurtherCheck(First, Second)) + return false; + + return true; + } + + return false; +} + +void PPCMacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { + std::vector<SUnit> &SU = DAGInstrs->SUnits; + if (SU.size() < 2) + return; + + const PPCSubtarget &Subtarget = DAGInstrs->MF.getSubtarget<PPCSubtarget>(); + const PPCFusionTable *FT = nullptr; + + switch (Subtarget.getDarwinDirective()) { + default: return; + + case PPC::DIR_PWR9: { + const static PPCFusionTable + P9FusionTable(P9FusedInstrsTable, array_lengthof(P9FusedInstrsTable)); + + FT = &P9FusionTable; + break; + } + } + + // Iterate over the current scheduling region sequentially. + unsigned Idx = 0, LastIdx = SU.size() - 1; + + while (Idx < LastIdx) { + // FIXME: Do we have any pattern that needs to scan more SUs + // (from Idx+1 to LastIdx)?
+ if (FT->areInstrsFusable(TRI, + *SU[Idx].getInstr(), *SU[Idx + 1].getInstr())) { + addClusterEdge(SU[Idx], SU[Idx + 1]); + ++Idx; // Second now belongs to this pair; do not reuse it as a First. + } + ++Idx; + } +} Index: lib/Target/PowerPC/PPCSubtarget.h =================================================================== --- lib/Target/PowerPC/PPCSubtarget.h +++ lib/Target/PowerPC/PPCSubtarget.h @@ -187,6 +187,9 @@ } const PPCTargetMachine &getTargetMachine() const { return TM; } + void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> + &Mutations) const override; + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); Index: lib/Target/PowerPC/PPCSubtarget.cpp =================================================================== --- lib/Target/PowerPC/PPCSubtarget.cpp +++ lib/Target/PowerPC/PPCSubtarget.cpp @@ -13,6 +13,7 @@ #include "PPCSubtarget.h" #include "PPC.h" +#include "PPCMacroFusion.h" #include "PPCRegisterInfo.h" #include "PPCTargetMachine.h" #include "llvm/CodeGen/MachineFunction.h" @@ -247,5 +248,12 @@ return flags | PPCII::MO_NLP_FLAG; } +void PPCSubtarget::getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const +{ + Mutations.push_back(make_unique<PPCMacroFusion>(*getInstrInfo(), + *getRegisterInfo())); +} + bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); } Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -13,9 +13,11 @@ #include "PPCTargetMachine.h" #include "PPC.h" +#include "PPCMacroFusion.h" #include "PPCTargetObjectFile.h" #include "PPCTargetTransformInfo.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include
"llvm/IR/Function.h" @@ -312,7 +314,9 @@ void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + ScheduleDAGMutation *createMacroFusion(ScheduleDAGInstrs *DAG) const override; }; + } // namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { @@ -435,6 +439,11 @@ addPass(createPPCBranchSelectionPass(), false); } +ScheduleDAGMutation * +PPCPassConfig::createMacroFusion(ScheduleDAGInstrs *DAG) const { + return new PPCMacroFusion(*DAG->TII, *DAG->TRI); +} + TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); Index: test/CodeGen/PowerPC/fusing-constant.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/fusing-constant.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=ppc64 -mcpu=pwr9 -verify-machineinstrs | \ +; RUN: FileCheck %s -check-prefix=CHECK-P9 +; RUN: llc < %s -march=ppc64 -mcpu=pwr8 -verify-machineinstrs | \ +; RUN: FileCheck %s -check-prefix=CHECK-P8 +define i64 @fusingOrisOri(i64 %x, i64 %y, i64 %z) { +entry: + %add = add nsw i64 %x, 4295163905 + %add1 = add nsw i64 %y, 8590196738 + %add2 = add nsw i64 %z, 12885229571 + %and = and i64 %add1, %add + %and3 = and i64 %and, %add2 + ret i64 %and3 + +; CHECK-P9-LABEL: @fusingOrisOri +; CHECK-P9: oris +; CHECK-P9-NEXT: ori +; CHECK-P9: oris +; CHECK-P9-NEXT: ori +; CHECK-P9: oris +; CHECK-P9-NEXT: ori + +; CHECK-P8-LABEL: @fusingOrisOri +; CHECK-P8: oris +; CHECK-P8: oris +; CHECK-P8: oris +; CHECK-P8: ori +; CHECK-P8: ori +; CHECK-P8: ori +}