Index: lib/CodeGen/MachinePipeliner.cpp =================================================================== --- lib/CodeGen/MachinePipeliner.cpp +++ lib/CodeGen/MachinePipeliner.cpp @@ -125,6 +125,7 @@ STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline"); STATISTIC(NumPipelined, "Number of loops software pipelined"); +STATISTIC(NumNodeOrderIssues, "Number of node order issues found"); /// A command line option to turn software pipelining on or off. static cl::opt EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), @@ -241,6 +242,8 @@ struct NodeInfo { int ASAP = 0; int ALAP = 0; + int ZeroLatencyDepth = 0; + int ZeroLatencyHeight = 0; NodeInfo() = default; }; @@ -320,9 +323,21 @@ /// The depth, in the dependence graph, for a node. int getDepth(SUnit *Node) { return Node->getDepth(); } + /// The maximum unweighted length of a path from an arbitrary node to the + /// given node in which each edge has latency 0 + int getZeroLatencyDepth(SUnit *Node) { + return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth; + } + /// The height, in the dependence graph, for a node. int getHeight(SUnit *Node) { return Node->getHeight(); } + /// The maximum unweighted length of a path from the given node to an + /// arbitrary node in which each edge has latency 0 + int getZeroLatencyHeight(SUnit *Node) { + return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight; + } + /// Return true if the dependence is a back-edge in the data dependence graph. /// Since the DAG doesn't contain cycles, we represent a cycle in the graph /// using an anti dependence from a Phi to an instruction. @@ -404,6 +419,7 @@ void addConnectedNodes(SUnit *SU, NodeSet &NewSet, SetVector &NodesAdded); void computeNodeOrder(NodeSetType &NodeSets); + bool isValidNodeOrder(const NodeSetType &Circuits) const; bool schedulePipeline(SMSchedule &Schedule); void generatePipelinedLoop(SMSchedule &Schedule); void generateProlog(SMSchedule &Schedule, unsigned LastStage, @@ -863,6 +879,7 @@ NodeSetType NodeSets; findCircuits(NodeSets); + NodeSetType Circuits = NodeSets; // Calculate the MII. unsigned ResMII = calculateResMII(); @@ -916,6 +933,18 @@ computeNodeOrder(NodeSets); + // check for node order issues + bool ValidNodeOrder = isValidNodeOrder(Circuits); + + // Note that we do not use an assert to check for invalid node orders. + // The reason is that although an invalid node oder may prevent + // the pipeliner from finding a pipelined schedule for arbitrary II, + // it does not lead to the generation of incorrect code. + DEBUG({ + if (!ValidNodeOrder) + dbgs() << "Invalid node order found!\n"; + }); + SMSchedule Schedule(Pass.MF); Scheduled = schedulePipeline(Schedule); @@ -1568,11 +1597,12 @@ }); int maxASAP = 0; - // Compute ASAP. + // Compute ASAP and ZeroLatencyDepth. for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(), E = Topo.end(); I != E; ++I) { int asap = 0; + int zeroLatencyDepth = 0; SUnit *SU = &SUnits[*I]; for (SUnit::const_pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end(); @@ -1582,16 +1612,21 @@ SUnit *pred = IP->getSUnit(); asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) - getDistance(pred, SU, *IP) * MII)); + if (getLatency(SU, *IP) == 0) + zeroLatencyDepth = + std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1); } maxASAP = std::max(maxASAP, asap); ScheduleInfo[*I].ASAP = asap; + ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth; } - // Compute ALAP and MOV. + // Compute ALAP, ZeroLatencyHeight, and MOV. for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(), E = Topo.rend(); I != E; ++I) { int alap = maxASAP; + int zeroLatencyHeight = 0; SUnit *SU = &SUnits[*I]; for (SUnit::const_succ_iterator IS = SU->Succs.begin(), ES = SU->Succs.end(); @@ -1601,9 +1636,13 @@ SUnit *succ = IS->getSUnit(); alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) + getDistance(SU, succ, *IS) * MII)); + if (getLatency(SU, *IS) == 0) + zeroLatencyHeight = + std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1); } ScheduleInfo[*I].ALAP = alap; + ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight; } // After computing the node functions, compute the summary for each node set. @@ -1618,6 +1657,8 @@ dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n"; dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n"; dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n"; + dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n"; + dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n"; } }); } @@ -1986,14 +2027,6 @@ } } -/// Return true if Inst1 defines a value that is used in Inst2. -static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) { - for (auto &SI : Inst1->Succs) - if (SI.getSUnit() == Inst2 && SI.getKind() == SDep::Data) - return true; - return false; -} - /// Compute an ordered list of the dependence graph nodes, which /// indicates the order that the nodes will be scheduled. This is a /// two-level algorithm. First, a partial order is created, which @@ -2040,18 +2073,20 @@ while (!R.empty()) { if (Order == TopDown) { // Choose the node with the maximum height. If more than one, choose - // the node with the lowest MOV. If still more than one, check if there - // is a dependence between the instructions. + // the node with the maximum ZeroLatencyHeight. If still more than one, + // choose the node with the lowest MOV. while (!R.empty()) { SUnit *maxHeight = nullptr; for (SUnit *I : R) { if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight)) maxHeight = I; else if (getHeight(I) == getHeight(maxHeight) && - getMOV(I) < getMOV(maxHeight) && - !hasDataDependence(maxHeight, I)) + getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight)) maxHeight = I; - else if (hasDataDependence(I, maxHeight)) + else if (getHeight(I) == getHeight(maxHeight) && + getZeroLatencyHeight(I) == + getZeroLatencyHeight(maxHeight) && + getMOV(I) < getMOV(maxHeight)) maxHeight = I; } NodeOrder.insert(maxHeight); @@ -2084,18 +2119,19 @@ R.insert(N.begin(), N.end()); } else { // Choose the node with the maximum depth. If more than one, choose - // the node with the lowest MOV. If there is still more than one, check - // for a dependence between the instructions. + // the node with the maximum ZeroLatencyDepth. If still more than one, + // choose the node with the lowest MOV. while (!R.empty()) { SUnit *maxDepth = nullptr; for (SUnit *I : R) { if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth)) maxDepth = I; else if (getDepth(I) == getDepth(maxDepth) && - getMOV(I) < getMOV(maxDepth) && - !hasDataDependence(I, maxDepth)) + getZeroLatencyDepth(I) > getZeroLatencyHeight(maxDepth)) maxDepth = I; - else if (hasDataDependence(maxDepth, I)) + else if (getDepth(I) == getDepth(maxDepth) && + getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) && + getMOV(I) < getMOV(maxDepth)) maxDepth = I; } NodeOrder.insert(maxDepth); @@ -3862,6 +3898,88 @@ return true; } +/// A property of the node order in swing-modulo-scheduling is +/// that for nodes outside circuits the following holds: +/// none of them is scheduled after both a successor and a +/// predecessor. +/// The method below checks whether the property is met. +bool SwingSchedulerDAG::isValidNodeOrder(const NodeSetType &Circuits) const { + + // a sorted vector that maps each SUnit to its index in the NodeOrder + typedef std::pair UnitIndex; + std::vector Indices(NodeOrder.size(), std::make_pair(nullptr, 0)); + + for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) + Indices.push_back(std::make_pair(NodeOrder[i], i)); + + auto CompareKey = [](UnitIndex i1, UnitIndex i2) { + return std::get<0>(i1) < std::get<0>(i2); + }; + + // sort, so that we can perform a binary search + std::sort(Indices.begin(), Indices.end(), CompareKey); + + bool Valid = true; + // for each SUnit in the NodeOrder, check whether + // it appears after both a successor and a predecessor + // of the SUnit. If this is the case, and the SUnit + // is not part of circuit, then the NodeOrder is not + // valid. + for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) { + SUnit *SU = NodeOrder[i]; + unsigned Index = i; + + bool PredBefore = false; + bool SuccBefore = false; + + SUnit *Succ; + SUnit *Pred; + + for (SDep &PredEdge : SU->Preds) { + SUnit *PredSU = PredEdge.getSUnit(); + unsigned PredIndex = + std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(), + std::make_pair(PredSU, 0), CompareKey)); + if (!PredSU->getInstr()->isPHI() && PredIndex < Index) { + PredBefore = true; + Pred = PredSU; + break; + } + } + + for (SDep &SuccEdge : SU->Succs) { + SUnit *SuccSU = SuccEdge.getSUnit(); + unsigned SuccIndex = + std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(), + std::make_pair(SuccSU, 0), CompareKey)); + if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) { + SuccBefore = true; + Succ = SuccSU; + break; + } + } + + if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) { + // instructions in circuits are allowed to be scheduled + // after both a successor and predecessor. + bool InCircuit = std::any_of( + Circuits.begin(), Circuits.end(), + [SU](const NodeSet &Circuit) { return Circuit.count(SU); }); + if (InCircuit) + DEBUG(dbgs() << "In a circuit, predecessor ";); + else { + Valid = false; + NumNodeOrderIssues++; + DEBUG(dbgs() << "Predecessor ";); + } + DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum + << " are scheduled before node " << SU->NodeNum << "\n";); + } + } + + return Valid; +} + /// Attempt to fix the degenerate cases when the instruction serialization /// causes the register lifetimes to overlap. For example, /// p' = store_pi(p, b) Index: test/CodeGen/Hexagon/SUnit-boundary-prob.ll =================================================================== --- test/CodeGen/Hexagon/SUnit-boundary-prob.ll +++ test/CodeGen/Hexagon/SUnit-boundary-prob.ll @@ -1,8 +1,12 @@ -; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 < %s | FileCheck %s +; REQUIRES: asserts +; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 --stats -o - 2>&1 < %s | FileCheck %s ; This was aborting while processing SUnits. ; CHECK: vmem +; CHECK-NOT: Number of node order issues found +; CHECK: Number of loops software pipelined +; CHECK-NOT: Number of node order issues found source_filename = "bugpoint-output-bdb0052.bc" target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon-unknown--elf" Index: test/CodeGen/Hexagon/frame-offset-overflow.ll =================================================================== --- test/CodeGen/Hexagon/frame-offset-overflow.ll +++ test/CodeGen/Hexagon/frame-offset-overflow.ll @@ -1,9 +1,14 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; REQUIRES: asserts +; RUN: llc -march=hexagon --stats -o - 2>&1 < %s | FileCheck %s -; In reality, check that the compilation succeeded and that some code was -; generated. +; Check that the compilation succeeded and that some code was generated. ; CHECK: vadd +; Check that the loop is pipelined and that a valid node order is used. +; CHECK-NOT: Number of node order issues found +; CHECK: Number of loops software pipelined +; CHECK-NOT: Number of node order issues found + target triple = "hexagon" define void @fred(i16* noalias nocapture readonly %p0, i32 %p1, i32 %p2, i16* noalias nocapture %p3, i32 %p4) local_unnamed_addr #1 { Index: test/CodeGen/Hexagon/swp-vmult.ll =================================================================== --- test/CodeGen/Hexagon/swp-vmult.ll +++ test/CodeGen/Hexagon/swp-vmult.ll @@ -4,8 +4,8 @@ ; Multiply and accumulate ; CHECK: mpyi([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]]) ; CHECK-NEXT: add(r{{[0-9]+}},#4) -; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0) -; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +; CHECK-DAG: [[REG1]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +; CHECK-DAG: [[REG0]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0) ; CHECK-NEXT: endloop0 define i32 @foo(i32* %a, i32* %b, i32 %n) { Index: test/CodeGen/Hexagon/vect/vect-shuffle.ll =================================================================== --- test/CodeGen/Hexagon/vect/vect-shuffle.ll +++ test/CodeGen/Hexagon/vect/vect-shuffle.ll @@ -1,8 +1,12 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s +; REQUIRES: asserts +; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr --stats -o - 2>&1 < %s | FileCheck %s ; Check that store is post-incremented. ; CHECK-NOT: extractu(r{{[0-9]+}},#32, ; CHECK-NOT: insert +; CHECK-NOT: Number of node order issues found +; CHECK: Number of loops software pipelined +; CHECK-NOT: Number of node order issues found target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" target triple = "hexagon"