Index: llvm/trunk/lib/CodeGen/MachinePipeliner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/MachinePipeliner.cpp +++ llvm/trunk/lib/CodeGen/MachinePipeliner.cpp @@ -102,6 +102,7 @@ #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -171,6 +172,12 @@ cl::ReallyHidden, cl::init(false), cl::ZeroOrMore, cl::desc("Ignore RecMII")); +// A command line option to enable the CopyToPhi DAG mutation. +static cl::opt + SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden, + cl::init(true), cl::ZeroOrMore, + cl::desc("Enable CopyToPhi DAG Mutation")); + namespace { class NodeSet; @@ -307,12 +314,18 @@ void unblock(int U); }; + struct CopyToPhiMutation : public ScheduleDAGMutation { + void apply(ScheduleDAGInstrs *DAG) override; + }; + public: SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis, const RegisterClassInfo &rci) : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis), RegClassInfo(rci), Topo(SUnits, &ExitSU) { P.MF->getSubtarget().getSMSMutations(Mutations); + if (SwpEnableCopyToPhi) + Mutations.push_back(llvm::make_unique()); } void schedule() override; @@ -391,6 +404,8 @@ Mutations.push_back(std::move(Mutation)); } + static bool classof(const ScheduleDAGInstrs *DAG) { return true; } + private: void addLoopCarriedDependences(AliasAnalysis *AA); void updatePhiDependences(); @@ -893,8 +908,8 @@ addLoopCarriedDependences(AA); updatePhiDependences(); Topo.InitDAGTopologicalSorting(); - postprocessDAG(); changeDependences(); + postprocessDAG(); LLVM_DEBUG(dump()); NodeSetType NodeSets; @@ -1624,6 +1639,85 @@ swapAntiDependences(SUnits); } +// Create artificial dependencies between the source of COPY/REG_SEQUENCE that +// is loop-carried to the USE in next iteration. This will help pipeliner avoid +// additional copies that are needed across iterations. An artificial dependence +// edge is added from USE to SOURCE of COPY/REG_SEQUENCE. + +// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried) +// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE +// PHI-------True-Dep------> USEOfPhi + +// The mutation creates +// USEOfPHI -------Artificial-Dep---> SRCOfCopy + +// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy +// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled +// late to avoid additional copies across iterations. The possible scheduling +// order would be +// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE. + +void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) { + for (SUnit &SU : DAG->SUnits) { + // Find the COPY/REG_SEQUENCE instruction. + if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence()) + continue; + + // Record the loop carried PHIs. + SmallVector PHISUs; + // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions. + SmallVector SrcSUs; + + for (auto &Dep : SU.Preds) { + SUnit *TmpSU = Dep.getSUnit(); + MachineInstr *TmpMI = TmpSU->getInstr(); + SDep::Kind DepKind = Dep.getKind(); + // Save the loop carried PHI. + if (DepKind == SDep::Anti && TmpMI->isPHI()) + PHISUs.push_back(TmpSU); + // Save the source of COPY/REG_SEQUENCE. + // If the source has no pre-decessors, we will end up creating cycles. + else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0) + SrcSUs.push_back(TmpSU); + } + + if (PHISUs.size() == 0 || SrcSUs.size() == 0) + continue; + + // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this + // SUnit to the container. + SmallVector UseSUs; + for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) { + for (auto &Dep : (*I)->Succs) { + if (Dep.getKind() != SDep::Data) + continue; + + SUnit *TmpSU = Dep.getSUnit(); + MachineInstr *TmpMI = TmpSU->getInstr(); + if (TmpMI->isPHI() || TmpMI->isRegSequence()) { + PHISUs.push_back(TmpSU); + continue; + } + UseSUs.push_back(TmpSU); + } + } + + if (UseSUs.size() == 0) + continue; + + SwingSchedulerDAG *SDAG = cast(DAG); + // Add the artificial dependencies if it does not form a cycle. + for (auto I : UseSUs) { + for (auto Src : SrcSUs) { + if (!SDAG->Topo.IsReachable(I, Src) && Src != I) { + Src->addPred(SDep(I, SDep::Artificial)); + SDAG->Topo.AddPred(Src, I); + } + } + } + } +} + /// Return true for DAG nodes that we ignore when computing the cost functions. /// We ignore the back-edge recurrence in order to avoid unbounded recursion /// in the calculation of the ASAP, ALAP, etc functions. Index: llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll =================================================================== --- llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll +++ llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \ +; RUN: 2>&1 | FileCheck %s + +; Test that the artificial dependence is created as a result of +; CopyToPhi DAG mutation. +; CHECK: Ord Latency=0 Artificial +target triple = "hexagon" + +; Function Attrs: nounwind +define void @foo(i64* nocapture readonly %r64, i16 zeroext %n, i16 zeroext %s, i64* nocapture %p64) #0 { +entry: + %conv = zext i16 %n to i32 + %cmp = icmp eq i16 %n, 0 + br i1 %cmp, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = load i64, i64* %r64, align 8 + %v.sroa.0.0.extract.trunc = trunc i64 %tmp to i16 + %v.sroa.4.0.extract.shift = lshr i64 %tmp, 16 + %v.sroa.4.0.extract.trunc = trunc i64 %v.sroa.4.0.extract.shift to i16 + %v.sroa.5.0.extract.shift = lshr i64 %tmp, 32 + %v.sroa.5.0.extract.trunc = trunc i64 %v.sroa.5.0.extract.shift to i16 + %v.sroa.6.0.extract.shift = lshr i64 %tmp, 48 + %v.sroa.6.0.extract.trunc = trunc i64 %v.sroa.6.0.extract.shift to i16 + %tmp1 = bitcast i64* %p64 to i16* + %conv2 = zext i16 %s to i32 + %add.ptr = getelementptr inbounds i16, i16* %tmp1, i32 %conv2 + %add.ptr.sum = add nuw nsw i32 %conv2, 1 + %add.ptr3 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum + %add.ptr.sum50 = add nuw nsw i32 %conv2, 2 + %add.ptr4 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum50 + %add.ptr.sum51 = add nuw nsw i32 %conv2, 3 + %add.ptr5 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum51 + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %add.ptr11.phi = phi i16* [ %add.ptr11.inc, %for.body ], [ %add.ptr, %for.body.preheader ] + %add.ptr16.phi = phi i16* [ %add.ptr16.inc, %for.body ], [ %add.ptr3, %for.body.preheader ] + %add.ptr21.phi = phi i16* [ %add.ptr21.inc, %for.body ], [ %add.ptr4, %for.body.preheader ] + %add.ptr26.phi = phi i16* [ %add.ptr26.inc, %for.body ], [ %add.ptr5, %for.body.preheader ] + %i.058.pmt = phi i32 [ %inc.pmt, %for.body ], [ 0, %for.body.preheader ] + %v.sroa.0.157 = phi i16 [ %v.sroa.0.0.extract.trunc34, %for.body ], [ %v.sroa.0.0.extract.trunc, %for.body.preheader ] + %v.sroa.4.156 = phi i16 [ %v.sroa.4.0.extract.trunc36, %for.body ], [ %v.sroa.4.0.extract.trunc, %for.body.preheader ] + %v.sroa.5.155 = phi i16 [ %v.sroa.5.0.extract.trunc38, %for.body ], [ %v.sroa.5.0.extract.trunc, %for.body.preheader ] + %v.sroa.6.154 = phi i16 [ %v.sroa.6.0.extract.trunc40, %for.body ], [ %v.sroa.6.0.extract.trunc, %for.body.preheader ] + %q64.153.pn = phi i64* [ %q64.153, %for.body ], [ %r64, %for.body.preheader ] + %q64.153 = getelementptr inbounds i64, i64* %q64.153.pn, i32 1 + store i16 %v.sroa.0.157, i16* %add.ptr11.phi, align 2 + store i16 %v.sroa.4.156, i16* %add.ptr16.phi, align 2 + store i16 %v.sroa.5.155, i16* %add.ptr21.phi, align 2 + store i16 %v.sroa.6.154, i16* %add.ptr26.phi, align 2 + %tmp2 = load i64, i64* %q64.153, align 8 + %v.sroa.0.0.extract.trunc34 = trunc i64 %tmp2 to i16 + %v.sroa.4.0.extract.shift35 = lshr i64 %tmp2, 16 + %v.sroa.4.0.extract.trunc36 = trunc i64 %v.sroa.4.0.extract.shift35 to i16 + %v.sroa.5.0.extract.shift37 = lshr i64 %tmp2, 32 + %v.sroa.5.0.extract.trunc38 = trunc i64 %v.sroa.5.0.extract.shift37 to i16 + %v.sroa.6.0.extract.shift39 = lshr i64 %tmp2, 48 + %v.sroa.6.0.extract.trunc40 = trunc i64 %v.sroa.6.0.extract.shift39 to i16 + %inc.pmt = add i32 %i.058.pmt, 1 + %cmp8 = icmp slt i32 %inc.pmt, %conv + %add.ptr11.inc = getelementptr i16, i16* %add.ptr11.phi, i32 4 + %add.ptr16.inc = getelementptr i16, i16* %add.ptr16.phi, i32 4 + %add.ptr21.inc = getelementptr i16, i16* %add.ptr21.phi, i32 4 + %add.ptr26.inc = getelementptr i16, i16* %add.ptr26.phi, i32 4 + br i1 %cmp8, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind "target-cpu"="hexagonv65" }