Index: docs/CommandGuide/llvm-exegesis.rst =================================================================== --- docs/CommandGuide/llvm-exegesis.rst +++ docs/CommandGuide/llvm-exegesis.rst @@ -67,7 +67,7 @@ .. code-block:: bash #!/bin/bash - readonly INSTRUCTIONS=$(grep INSTRUCTION_LIST_END build/lib/Target/X86/X86GenInstrInfo.inc | cut -f2 -d=) + readonly INSTRUCTIONS=$(($(grep INSTRUCTION_LIST_END build/lib/Target/X86/X86GenInstrInfo.inc | cut -f2 -d=) - 1)) for INSTRUCTION in $(seq 1 ${INSTRUCTIONS}); do ./build/bin/llvm-exegesis -mode=latency -opcode-index=${INSTRUCTION} | sed -n '/---/,$p' Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2283,8 +2283,11 @@ return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); // fold (addcarry x, y, false) -> (uaddo x, y) - if (isNullConstant(CarryIn)) - return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); + if (isNullConstant(CarryIn)) { + if (!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0))) + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); + } // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. if (isNullConstant(N0) && isNullConstant(N1)) { @@ -2592,8 +2595,11 @@ SDValue CarryIn = N->getOperand(2); // fold (subcarry x, y, false) -> (usubo x, y) - if (isNullConstant(CarryIn)) - return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); + if (isNullConstant(CarryIn)) { + if (!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0))) + return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); + } return SDValue(); } Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3499,15 +3499,25 @@ case ISD::USUBO: { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); - SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ? - ISD::ADD : ISD::SUB, dl, LHS.getValueType(), - LHS, RHS); + bool IsAdd = Node->getOpcode() == ISD::UADDO; + // If ADD/SUBCARRY is legal, use that instead. + unsigned OpcCarry = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY; + if (TLI.isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) { + SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1)); + SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(), + { LHS, RHS, CarryIn }); + Results.push_back(SDValue(NodeCarry.getNode(), 0)); + Results.push_back(SDValue(NodeCarry.getNode(), 1)); + break; + } + + SDValue Sum = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl, + LHS.getValueType(), LHS, RHS); Results.push_back(Sum); EVT ResultType = Node->getValueType(1); EVT SetCCType = getSetCCResultType(Node->getValueType(0)); - ISD::CondCode CC - = Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT; + ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT; SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC); Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType)); Index: lib/Target/Hexagon/HexagonISelDAGToDAG.h =================================================================== --- lib/Target/Hexagon/HexagonISelDAGToDAG.h +++ lib/Target/Hexagon/HexagonISelDAGToDAG.h @@ -105,6 +105,7 @@ void SelectV65Gather(SDNode *N); void SelectV65GatherPred(SDNode *N); void SelectHVXDualOutput(SDNode *N); + void SelectAddSubCarry(SDNode *N); void SelectVAlign(SDNode *N); void SelectVAlignAddr(SDNode *N); void SelectTypecast(SDNode *N); Index: lib/Target/Hexagon/HexagonISelDAGToDAG.cpp =================================================================== --- lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -762,6 +762,15 @@ ReplaceNode(N, R); } +void HexagonDAGToDAGISel::SelectAddSubCarry(SDNode *N) { + unsigned OpcCarry = N->getOpcode() == HexagonISD::ADDC ? Hexagon::A4_addp_c + : Hexagon::A4_subp_c; + SDNode *C = CurDAG->getMachineNode(OpcCarry, SDLoc(N), N->getVTList(), + { N->getOperand(0), N->getOperand(1), + N->getOperand(2) }); + ReplaceNode(N, C); +} + void HexagonDAGToDAGISel::SelectVAlign(SDNode *N) { MVT ResTy = N->getValueType(0).getSimpleVT(); if (HST->isHVXVectorType(ResTy, true)) @@ -875,6 +884,9 @@ case ISD::STORE: return SelectStore(N); case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N); case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N); + + case HexagonISD::ADDC: + case HexagonISD::SUBC: return SelectAddSubCarry(N); case HexagonISD::VALIGN: return SelectVAlign(N); case HexagonISD::VALIGNADDR: return SelectVAlignAddr(N); case HexagonISD::TYPECAST: return SelectTypecast(N); Index: lib/Target/Hexagon/HexagonISelLowering.h =================================================================== --- lib/Target/Hexagon/HexagonISelLowering.h +++ lib/Target/Hexagon/HexagonISelLowering.h @@ -36,6 +36,8 @@ CONST32 = OP_BEGIN, CONST32_GP, // For marking data present in GP. + ADDC, // Add with carry: (X, Y, Cin) -> (X+Y, Cout). + SUBC, // Sub with carry: (X, Y, Cin) -> (X+~Y+Cin, Cout). ALLOCA, AT_GOT, // Index in GOT. @@ -162,6 +164,7 @@ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/Hexagon/HexagonISelLowering.cpp =================================================================== --- lib/Target/Hexagon/HexagonISelLowering.cpp +++ lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1327,13 +1327,18 @@ setMinimumJumpTableEntries(std::numeric_limits::max()); setOperationAction(ISD::BR_JT, MVT::Other, Expand); - // Only add and sub that detect overflow are the saturating ones. + // Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit, + // but they only operate on i64. for (MVT VT : MVT::integer_valuetypes()) { - setOperationAction(ISD::UADDO, VT, Expand); - setOperationAction(ISD::SADDO, VT, Expand); - setOperationAction(ISD::USUBO, VT, Expand); - setOperationAction(ISD::SSUBO, VT, Expand); + setOperationAction(ISD::UADDO, VT, Expand); + setOperationAction(ISD::USUBO, VT, Expand); + setOperationAction(ISD::SADDO, VT, Expand); + setOperationAction(ISD::SSUBO, VT, Expand); + setOperationAction(ISD::ADDCARRY, VT, Expand); + setOperationAction(ISD::SUBCARRY, VT, Expand); } + setOperationAction(ISD::ADDCARRY, MVT::i64, Custom); + setOperationAction(ISD::SUBCARRY, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i8, Promote); setOperationAction(ISD::CTLZ, MVT::i16, Promote); @@ -1681,6 +1686,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((HexagonISD::NodeType)Opcode) { + case HexagonISD::ADDC: return "HexagonISD::ADDC"; + case HexagonISD::SUBC: return "HexagonISD::SUBC"; case HexagonISD::ALLOCA: return "HexagonISD::ALLOCA"; case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT"; case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL"; @@ -2705,6 +2712,24 @@ return M; } +SDValue +HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned Opc = Op.getOpcode(); + SDValue X = Op.getOperand(0), Y = Op.getOperand(1), C = Op.getOperand(2); + + if (Opc == ISD::ADDCARRY) + return DAG.getNode(HexagonISD::ADDC, dl, Op.getNode()->getVTList(), + { X, Y, C }); + + EVT CarryTy = C.getValueType(); + SDValue SubC = DAG.getNode(HexagonISD::SUBC, dl, Op.getNode()->getVTList(), + { X, Y, DAG.getLogicalNOT(dl, C, CarryTy) }); + SDValue Out[] = { SubC.getValue(0), + DAG.getLogicalNOT(dl, SubC.getValue(1), CarryTy) }; + return DAG.getMergeValues(Out, dl); +} + SDValue HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); @@ -2763,6 +2788,8 @@ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::LOAD: return LowerUnalignedLoad(Op, DAG); + case ISD::ADDCARRY: + case ISD::SUBCARRY: return LowerAddSubCarry(Op, DAG); case ISD::SRA: case ISD::SHL: case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG); Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp =================================================================== --- lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -2253,7 +2253,7 @@ // set of bit groups, and then mask in the zeros at the end. With early // masking, we only insert the non-zero parts of the result at every step. - unsigned InstCnt, InstCntLateMask; + unsigned InstCnt = 0, InstCntLateMask = 0; LLVM_DEBUG(dbgs() << "\tEarly masking:\n"); SDNode *RN = Select(N, false, &InstCnt); LLVM_DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n"); Index: test/CodeGen/Hexagon/adde.ll =================================================================== --- test/CodeGen/Hexagon/adde.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s - -; CHECK-DAG: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}}) -; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}}) - -define void @check_adde_addc(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64* %a4, i64* %a5) { -b6: - %v7 = zext i64 %a0 to i128 - %v8 = zext i64 %a1 to i128 - %v9 = shl i128 %v8, 64 - %v10 = or i128 %v7, %v9 - %v11 = zext i64 %a2 to i128 - %v12 = zext i64 %a3 to i128 - %v13 = shl i128 %v12, 64 - %v14 = or i128 %v11, %v13 - %v15 = add i128 %v10, %v14 - %v16 = lshr i128 %v15, 64 - %v17 = trunc i128 %v15 to i64 - %v18 = trunc i128 %v16 to i64 - store i64 %v17, i64* %a4 - store i64 %v18, i64* %a5 - ret void -} Index: test/CodeGen/Hexagon/addsubcarry.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/addsubcarry.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +@g = global i128 zeroinitializer, align 8 + +; CHECK-LABEL: addc: +; CHECK: p[[P0:[0-3]]] = and(p[[P1:[0-9]]],!p[[P1]]) +; CHECK: add({{.*}},{{.*}},p[[P0]]):carry +; CHECK: add({{.*}},{{.*}},p[[P0]]):carry +define void @addc(i128 %a0, i128 %a1) #0 { + %v0 = add i128 %a0, %a1 + store i128 %v0, i128* @g, align 8 + ret void +} + +; CHECK-LABEL: subc: +; CHECK: p[[P0:[0-3]]] = or(p[[P1:[0-9]]],!p[[P1]]) +; CHECK: sub({{.*}},{{.*}},p[[P0]]):carry +; CHECK: sub({{.*}},{{.*}},p[[P0]]):carry +define void @subc(i128 %a0, i128 %a1) #0 { + %v0 = sub i128 %a0, %a1 + store i128 %v0, i128* @g, align 8 + ret void +} + + Index: test/CodeGen/Hexagon/sube.ll =================================================================== --- test/CodeGen/Hexagon/sube.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s - -; CHECK-DAG: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}}) -; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}}) -; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}}) - -define void @check_sube_subc(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64* %a4, i64* %a5) { -b6: - %v7 = zext i64 %a0 to i128 - %v8 = zext i64 %a1 to i128 - %v9 = shl i128 %v8, 64 - %v10 = or i128 %v7, %v9 - %v11 = zext i64 %a2 to i128 - %v12 = zext i64 %a3 to i128 - %v13 = shl i128 %v12, 64 - %v14 = or i128 %v11, %v13 - %v15 = sub i128 %v10, %v14 - %v16 = lshr i128 %v15, 64 - %v17 = trunc i128 %v15 to i64 - %v18 = trunc i128 %v16 to i64 - store i64 %v17, i64* %a4 - store i64 %v18, i64* %a5 - ret void -} Index: test/Transforms/InstCombine/sdiv-guard.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/sdiv-guard.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +; FIXME: If %flag is false then %s == 0 and guard should be triggered. + +define i32 @a(i1 %flag, i32 %X) nounwind readnone { +; CHECK-LABEL: @a( +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0 +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[CMP]]) #1 [ "deopt"() ] +; CHECK-NEXT: [[R:%.*]] = sdiv i32 100, [[X]] +; CHECK-NEXT: ret i32 [[R]] +; + %s = select i1 %flag, i32 %X, i32 0 + %cmp = icmp ne i32 %s, 0 + call void(i1, ...) @llvm.experimental.guard( i1 %cmp )[ "deopt"() ] + %r = sdiv i32 100, %s + ret i32 %r +} + Index: tools/llvm-exegesis/lib/Analysis.h =================================================================== --- tools/llvm-exegesis/lib/Analysis.h +++ tools/llvm-exegesis/lib/Analysis.h @@ -21,6 +21,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include #include #include @@ -40,11 +41,62 @@ template llvm::Error run(llvm::raw_ostream &OS) const; private: + using ClusterId = InstructionBenchmarkClustering::ClusterId; + // An llvm::MCSchedClassDesc augmented with some additional data. + struct SchedClass { + SchedClass(const llvm::MCSchedClassDesc &SD, + const llvm::MCSubtargetInfo &STI); + + const llvm::MCSchedClassDesc &SCDesc; + const llvm::SmallVector + NonRedundantWriteProcRes; + const std::vector> IdealizedProcResPressure; + }; + + // Represents the intersection of a sched class and a cluster. + class SchedClassCluster { + public: + const InstructionBenchmarkClustering::ClusterId &id() const { + return ClusterId; + } + const std::vector &getPointIds() const { return PointIds; } + const std::vector &getRepresentative() const { + return Representative; + } + + // Returns true if the cluster representative measurements match that of SC. + bool + measurementsMatch(const llvm::MCSubtargetInfo &STI, const SchedClass &SC, + const InstructionBenchmarkClustering &Clustering) const; + + void addPoint(size_t PointId, + const InstructionBenchmarkClustering &Clustering) { + PointIds.push_back(PointId); + const auto &Point = Clustering.getPoints()[PointId]; + if (ClusterId.isUndef()) { + ClusterId = Clustering.getClusterIdForPoint(PointId); + Representative.resize(Point.Measurements.size()); + } + for (size_t I = 0, E = Point.Measurements.size(); I < E; ++I) { + Representative[I].push(Point.Measurements[I]); + } + assert(ClusterId == Clustering.getClusterIdForPoint(PointId)); + } + + private: + InstructionBenchmarkClustering::ClusterId ClusterId; + std::vector PointIds; + // Measurement stats for the points in the SchedClassCluster. + std::vector Representative; + }; + void printInstructionRowCsv(size_t PointId, llvm::raw_ostream &OS) const; - void printSchedClassClustersHtml(std::vector PointIds, - llvm::raw_ostream &OS) const; - void printSchedClassDescHtml(const llvm::MCSchedClassDesc &SCDesc, + void + printSchedClassClustersHtml(const std::vector &Clusters, + const SchedClass &SC, + llvm::raw_ostream &OS) const; + void printSchedClassDescHtml(const SchedClass &SC, llvm::raw_ostream &OS) const; // Builds a map of Sched Class -> indices of points that belong to the sched @@ -52,12 +104,22 @@ std::unordered_map> makePointsPerSchedClass() const; + // Returns true of the data for this cluster matches the checked-in llvm data + // for this class. + const InstructionBenchmarkClustering &Clustering_; std::unique_ptr SubtargetInfo_; std::unique_ptr InstrInfo_; std::unordered_map MnemonicToOpcode_; }; +// Computes the idealized ProcRes Unit pressure. This is the expected +// distribution if the CPU scheduler can distribute the load as evenly as +// possible. +std::vector> computeIdealizedProcResPressure( + const llvm::MCSchedModel &SM, + llvm::SmallVector WPRS); + } // namespace exegesis #endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H Index: tools/llvm-exegesis/lib/Analysis.cpp =================================================================== --- tools/llvm-exegesis/lib/Analysis.cpp +++ tools/llvm-exegesis/lib/Analysis.cpp @@ -9,6 +9,7 @@ #include "Analysis.h" #include "BenchmarkResult.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/FormatVariadic.h" #include #include @@ -167,20 +168,15 @@ return PointsPerSchedClass; } -void Analysis::printSchedClassClustersHtml(std::vector PointIds, - llvm::raw_ostream &OS) const { - assert(!PointIds.empty()); - // Sort the points by cluster id so that we can display them grouped by - // cluster. - std::sort(PointIds.begin(), PointIds.end(), - [this](const size_t A, const size_t B) { - return Clustering_.getClusterIdForPoint(A) < - Clustering_.getClusterIdForPoint(B); - }); +void Analysis::printSchedClassClustersHtml( + const std::vector &Clusters, const SchedClass &SC, + llvm::raw_ostream &OS) const { const auto &Points = Clustering_.getPoints(); OS << ""; OS << ""; - for (const auto &Measurement : Points[PointIds[0]].Measurements) { + assert(!Clusters.empty()); + for (const auto &Measurement : + Points[Clusters[0].getPointIds()[0]].Measurements) { OS << ""; } OS << ""; - for (size_t I = 0, E = PointIds.size(); I < E;) { - const auto &CurrentClusterId = - Clustering_.getClusterIdForPoint(PointIds[I]); - OS << ""; - for (const auto &Stats : MeasurementStats) { + for (const auto &Stats : Cluster.getRepresentative()) { OS << "
ClusterIdOpcode/Config"; if (Measurement.DebugString.empty()) writeEscaped(OS, Measurement.Key); @@ -189,29 +185,24 @@ OS << "
"; - writeClusterId(OS, CurrentClusterId); + for (const SchedClassCluster &Cluster : Clusters) { + OS << "
"; + writeClusterId(OS, Cluster.id()); OS << "
    "; - std::vector MeasurementStats( - Points[PointIds[I]].Measurements.size()); - for (; I < E && - Clustering_.getClusterIdForPoint(PointIds[I]) == CurrentClusterId; - ++I) { - const auto &Point = Points[PointIds[I]]; + for (const size_t PointId : Cluster.getPointIds()) { + const auto &Point = Points[PointId]; OS << "
  • "; writeEscaped(OS, Point.Key.OpcodeName); OS << " "; writeEscaped(OS, Point.Key.Config); OS << "
  • "; - for (size_t J = 0, F = Point.Measurements.size(); J < F; ++J) { - MeasurementStats[J].push(Point.Measurements[J]); - } } OS << "
"; writeMeasurementValue(OS, Stats.avg()); OS << "
["; @@ -299,22 +290,87 @@ return Result; } -void Analysis::printSchedClassDescHtml(const llvm::MCSchedClassDesc &SCDesc, +Analysis::SchedClass::SchedClass(const llvm::MCSchedClassDesc &SD, + const llvm::MCSubtargetInfo &STI) + : SCDesc(SD), + NonRedundantWriteProcRes(getNonRedundantWriteProcRes(SD, STI)), + IdealizedProcResPressure(computeIdealizedProcResPressure( + STI.getSchedModel(), NonRedundantWriteProcRes)) {} + +bool Analysis::SchedClassCluster::measurementsMatch( + const llvm::MCSubtargetInfo &STI, const SchedClass &SC, + const InstructionBenchmarkClustering &Clustering) const { + const size_t NumMeasurements = Representative.size(); + std::vector ClusterCenterPoint(NumMeasurements); + std::vector SchedClassPoint(NumMeasurements); + // Latency case. + assert(!Clustering.getPoints().empty()); + const std::string &Mode = Clustering.getPoints()[0].Key.Mode; + if (Mode == "latency") { // FIXME: use an enum. + if (NumMeasurements != 1) { + llvm::errs() + << "invalid number of measurements in latency mode: expected 1, got " + << NumMeasurements << "\n"; + return false; + } + // Find the latency. + SchedClassPoint[0].Value = 0.0; + for (unsigned I = 0; I < SC.SCDesc.NumWriteLatencyEntries; ++I) { + const llvm::MCWriteLatencyEntry *const WLE = + STI.getWriteLatencyEntry(&SC.SCDesc, I); + SchedClassPoint[0].Value = + std::max(SchedClassPoint[0].Value, WLE->Cycles); + } + ClusterCenterPoint[0].Value = Representative[0].avg(); + } else if (Mode == "uops") { + for (int I = 0, E = Representative.size(); I < E; ++I) { + // Find the pressure on ProcResIdx `Key`. + uint16_t ProcResIdx = 0; + if (!llvm::to_integer(Representative[I].key(), ProcResIdx, 10)) { + llvm::errs() << "expected ProcResIdx key, got " + << Representative[I].key() << "\n"; + return false; + } + const auto ProcResPressureIt = + std::find_if(SC.IdealizedProcResPressure.begin(), + SC.IdealizedProcResPressure.end(), + [ProcResIdx](const std::pair &WPR) { + return WPR.first == ProcResIdx; + }); + SchedClassPoint[I].Value = + ProcResPressureIt == SC.IdealizedProcResPressure.end() + ? 0.0 + : ProcResPressureIt->second; + ClusterCenterPoint[I].Value = Representative[I].avg(); + } + } else { + llvm::errs() << "unimplemented measurement matching for mode ''" << Mode + << "''\n"; + return false; + } + return Clustering.isNeighbour(ClusterCenterPoint, SchedClassPoint); +} + +void Analysis::printSchedClassDescHtml(const SchedClass &SC, llvm::raw_ostream &OS) const { OS << ""; OS << ""; - if (SCDesc.isValid()) { + "th>"; + if (SC.SCDesc.isValid()) { + const auto &SM = SubtargetInfo_->getSchedModel(); OS << ""; - OS << ""; - OS << ""; + OS << ""; + OS << ""; // Latencies. OS << ""; // WriteProcRes. OS << ""; + // Idealized port pressure. + OS << ""; OS << ""; @@ -386,12 +452,21 @@ span.mono { font-family: monospace; } -span.minmax { - color: #888; -} td.measurement { text-align: center; } +tr.good-cluster td.measurement { + color: #292 +} +tr.bad-cluster td.measurement { + color: #922 +} +tr.good-cluster td.measurement span.minmax { + color: #888; +} +tr.bad-cluster td.measurement span.minmax { + color: #888; +} )"; @@ -399,46 +474,65 @@ template <> llvm::Error Analysis::run( llvm::raw_ostream &OS) const { + const auto &FirstPoint = Clustering_.getPoints()[0]; // Print the header. OS << "" << kHtmlHead << ""; OS << "

llvm-exegesis Analysis Results

"; OS << "

Triple: "; - writeEscaped(OS, Clustering_.getPoints()[0].LLVMTriple); + writeEscaped(OS, FirstPoint.LLVMTriple); OS << "

Cpu: "; - writeEscaped(OS, Clustering_.getPoints()[0].CpuName); + writeEscaped(OS, FirstPoint.CpuName); OS << "

"; - // All the points in a scheduling class should be in the same cluster. - // Print any scheduling class for which this is not the case. for (const auto &SchedClassAndPoints : makePointsPerSchedClass()) { - std::unordered_set ClustersForSchedClass; - for (const size_t PointId : SchedClassAndPoints.second) { + const auto SchedClassId = SchedClassAndPoints.first; + const std::vector &SchedClassPoints = SchedClassAndPoints.second; + const auto &SchedModel = SubtargetInfo_->getSchedModel(); + const llvm::MCSchedClassDesc *const SCDesc = + SchedModel.getSchedClassDesc(SchedClassId); + if (!SCDesc) + continue; + const SchedClass SC(*SCDesc, *SubtargetInfo_); + + // Bucket sched class points into sched class clusters. + std::vector SchedClassClusters; + for (const size_t PointId : SchedClassPoints) { const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId); if (!ClusterId.isValid()) - continue; // Ignore noise and errors. - ClustersForSchedClass.insert(ClusterId.getId()); + continue; // Ignore noise and errors. FIXME: take noise into account ? + auto SchedClassClusterIt = + std::find_if(SchedClassClusters.begin(), SchedClassClusters.end(), + [ClusterId](const SchedClassCluster &C) { + return C.id() == ClusterId; + }); + if (SchedClassClusterIt == SchedClassClusters.end()) { + SchedClassClusters.emplace_back(); + SchedClassClusterIt = std::prev(SchedClassClusters.end()); + } + SchedClassClusterIt->addPoint(PointId, Clustering_); } - if (ClustersForSchedClass.size() <= 1) + + // Print any scheduling class that has at least one cluster that does not + // match the checked-in data. + if (std::all_of(SchedClassClusters.begin(), SchedClassClusters.end(), + [this, &SC](const SchedClassCluster &C) { + return C.measurementsMatch(*SubtargetInfo_, SC, + Clustering_); + })) continue; // Nothing weird. - const auto &SchedModel = SubtargetInfo_->getSchedModel(); - const llvm::MCSchedClassDesc *const SCDesc = - SchedModel.getSchedClassDesc(SchedClassAndPoints.first); - if (!SCDesc) - continue; OS << "

Sched Class "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) writeEscaped(OS, SCDesc->Name); #else - OS << SchedClassAndPoints.first; + OS << SchedClassId; #endif - OS << " contains instructions with distinct performance " - "characteristics, falling into " - << ClustersForSchedClass.size() << " clusters:

"; - printSchedClassClustersHtml(SchedClassAndPoints.second, OS); - OS << "

llvm data:

"; - printSchedClassDescHtml(*SCDesc, OS); + OS << " contains instructions whose performance characteristics do" + " not match that of LLVM:

"; + printSchedClassClustersHtml(SchedClassClusters, SC, OS); + OS << "

llvm SchedModel data:

"; + printSchedClassDescHtml(SC, OS); OS << "
"; } @@ -446,4 +540,118 @@ return llvm::Error::success(); } +// Distributes a pressure budget as evenly as possible on the provided subunits +// given the already existing port pressure distribution. +// +// The algorithm is as follows: while there is remaining pressure to +// distribute, find the subunits with minimal pressure, and distribute +// remaining pressure equally up to the pressure of the unit with +// second-to-minimal pressure. +// For example, let's assume we want to distribute 2*P1256 +// (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is: +// DensePressure = P0 P1 P2 P3 P4 P5 P6 P7 +// 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5 +// RemainingPressure = 2.0 +// We sort the subunits by pressure: +// Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)] +// We'll first start by the subunits with minimal pressure, which are at +// the beginning of the sorted array. In this example there is one (P2). +// The subunit with second-to-minimal pressure is the next one in the +// array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles +// from the budget. +// Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)] +// RemainingPressure = 1.9 +// We repeat this process: distribute 0.2 pressure on each of the minimal +// P2 and P1, decrease budget by 2*0.2: +// Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)] +// RemainingPressure = 1.5 +// There are no second-to-minimal subunits so we just share the remaining +// budget (1.5 cycles) equally: +// Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)] +// RemainingPressure = 0.0 +// We stop as there is no remaining budget to distribute. +void distributePressure(float RemainingPressure, + llvm::SmallVector Subunits, + llvm::SmallVector &DensePressure) { + // Find the number of subunits with minimal pressure (they are at the + // front). + llvm::sort(Subunits.begin(), Subunits.end(), + [&DensePressure](const uint16_t A, const uint16_t B) { + return DensePressure[A] < DensePressure[B]; + }); + const auto getPressureForSubunit = [&DensePressure, + &Subunits](size_t I) -> float & { + return DensePressure[Subunits[I]]; + }; + size_t NumMinimalSU = 1; + while (NumMinimalSU < Subunits.size() && + getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) { + ++NumMinimalSU; + } + while (RemainingPressure > 0.0f) { + if (NumMinimalSU == Subunits.size()) { + // All units are minimal, just distribute evenly and be done. + for (size_t I = 0; I < NumMinimalSU; ++I) { + getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; + } + return; + } + // Distribute the remaining pressure equally. + const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1); + const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU); + assert(MinimalPressure < SecondToMinimalPressure); + const float Increment = SecondToMinimalPressure - MinimalPressure; + if (RemainingPressure <= NumMinimalSU * Increment) { + // There is not enough remaining pressure. + for (size_t I = 0; I < NumMinimalSU; ++I) { + getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; + } + return; + } + // Bump all minimal pressure subunits to `SecondToMinimalPressure`. + for (size_t I = 0; I < NumMinimalSU; ++I) { + getPressureForSubunit(I) = SecondToMinimalPressure; + RemainingPressure -= SecondToMinimalPressure; + } + while (NumMinimalSU < Subunits.size() && + getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) { + ++NumMinimalSU; + } + } +} + +std::vector> computeIdealizedProcResPressure( + const llvm::MCSchedModel &SM, + llvm::SmallVector WPRS) { + // DensePressure[I] is the port pressure for Proc Resource I. + llvm::SmallVector DensePressure(SM.getNumProcResourceKinds()); + llvm::sort(WPRS.begin(), WPRS.end(), + [](const llvm::MCWriteProcResEntry &A, + const llvm::MCWriteProcResEntry &B) { + return A.ProcResourceIdx < B.ProcResourceIdx; + }); + for (const llvm::MCWriteProcResEntry &WPR : WPRS) { + // Get units for the entry. + const llvm::MCProcResourceDesc *const ProcResDesc = + SM.getProcResource(WPR.ProcResourceIdx); + if (ProcResDesc->SubUnitsIdxBegin == nullptr) { + // This is a ProcResUnit. + DensePressure[WPR.ProcResourceIdx] += WPR.Cycles; + } else { + // This is a ProcResGroup. + llvm::SmallVector Subunits(ProcResDesc->SubUnitsIdxBegin, + ProcResDesc->SubUnitsIdxBegin + + ProcResDesc->NumUnits); + distributePressure(WPR.Cycles, Subunits, DensePressure); + } + } + // Turn dense pressure into sparse pressure by removing zero entries. + std::vector> Pressure; + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + if (DensePressure[I] > 0.0f) + Pressure.emplace_back(I, DensePressure[I]); + } + return Pressure; +} + } // namespace exegesis Index: tools/llvm-exegesis/lib/BenchmarkResult.h =================================================================== --- tools/llvm-exegesis/lib/BenchmarkResult.h +++ tools/llvm-exegesis/lib/BenchmarkResult.h @@ -77,6 +77,8 @@ double min() const { return MinValue; } double max() const { return MaxValue; } + const std::string &key() const { return Key; } + private: std::string Key; double SumValues = 0.0; Index: tools/llvm-exegesis/lib/Clustering.h =================================================================== --- tools/llvm-exegesis/lib/Clustering.h +++ tools/llvm-exegesis/lib/Clustering.h @@ -33,12 +33,10 @@ public: static ClusterId noise() { return ClusterId(kNoise); } static ClusterId error() { return ClusterId(kError); } - static ClusterId makeValid(size_t Id) { - return ClusterId(Id); - } + static ClusterId makeValid(size_t Id) { return ClusterId(Id); } ClusterId() : Id_(kUndef) {} bool operator==(const ClusterId &O) const { return Id_ == O.Id_; } - bool operator<(const ClusterId &O) const {return Id_ < O.Id_; } + bool operator<(const ClusterId &O) const { return Id_ < O.Id_; } bool isValid() const { return Id_ <= kMaxValid; } bool isUndef() const { return Id_ == kUndef; } @@ -53,7 +51,8 @@ private: explicit ClusterId(size_t Id) : Id_(Id) {} - static constexpr const size_t kMaxValid = std::numeric_limits::max() - 4; + static constexpr const size_t kMaxValid = + std::numeric_limits::max() - 4; static constexpr const size_t kNoise = kMaxValid + 1; static constexpr const size_t kError = kMaxValid + 2; static constexpr const size_t kUndef = kMaxValid + 3; @@ -88,12 +87,19 @@ const std::vector &getValidClusters() const { return Clusters_; } + // Returns true if the given point is within a distance Epsilon of each other. + bool isNeighbour(const std::vector &P, + const std::vector &Q) const; + private: InstructionBenchmarkClustering( - const std::vector &Points); + const std::vector &Points, double EpsilonSquared); llvm::Error validateAndSetup(); - void dbScan(size_t MinPts, double EpsilonSquared); + void dbScan(size_t MinPts); + std::vector rangeQuery(size_t Q) const; + const std::vector &Points_; + const double EpsilonSquared_; int NumDimensions_ = 0; // ClusterForPoint_[P] is the cluster id for Points[P]. std::vector ClusterIdForPoint_; Index: tools/llvm-exegesis/lib/Clustering.cpp =================================================================== --- tools/llvm-exegesis/lib/Clustering.cpp +++ tools/llvm-exegesis/lib/Clustering.cpp @@ -29,38 +29,41 @@ // [1] https://en.wikipedia.org/wiki/DBSCAN // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm -namespace { - // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not // including Q). -std::vector rangeQuery(const std::vector &Points, - const size_t Q, const double EpsilonSquared) { +std::vector +InstructionBenchmarkClustering::rangeQuery(const size_t Q) const { std::vector Neighbors; - const auto &QMeasurements = Points[Q].Measurements; - for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + const auto &QMeasurements = Points_[Q].Measurements; + for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (P == Q) continue; - const auto &PMeasurements = Points[P].Measurements; + const auto &PMeasurements = Points_[P].Measurements; if (PMeasurements.empty()) // Error point. continue; - double DistanceSquared = 0; - for (size_t I = 0, E = QMeasurements.size(); I < E; ++I) { - const auto Diff = PMeasurements[I].Value - QMeasurements[I].Value; - DistanceSquared += Diff * Diff; - } - if (DistanceSquared <= EpsilonSquared) { + if (isNeighbour(PMeasurements, QMeasurements)) { Neighbors.push_back(P); } } return Neighbors; } -} // namespace +bool InstructionBenchmarkClustering::isNeighbour( + const std::vector &P, + const std::vector &Q) const { + double DistanceSquared = 0.0; + for (size_t I = 0, E = P.size(); I < E; ++I) { + const auto Diff = P[I].Value - Q[I].Value; + DistanceSquared += Diff * Diff; + } + return DistanceSquared <= EpsilonSquared_; +} InstructionBenchmarkClustering::InstructionBenchmarkClustering( - const std::vector &Points) - : Points_(Points), NoiseCluster_(ClusterId::noise()), - ErrorCluster_(ClusterId::error()) {} + const std::vector &Points, + const double EpsilonSquared) + : Points_(Points), EpsilonSquared_(EpsilonSquared), + NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} llvm::Error InstructionBenchmarkClustering::validateAndSetup() { ClusterIdForPoint_.resize(Points_.size()); @@ -97,12 +100,11 @@ return llvm::Error::success(); } -void InstructionBenchmarkClustering::dbScan(const size_t MinPts, - const double EpsilonSquared) { +void InstructionBenchmarkClustering::dbScan(const size_t MinPts) { for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (!ClusterIdForPoint_[P].isUndef()) continue; // Previously processed in inner loop. - const auto Neighbors = rangeQuery(Points_, P, EpsilonSquared); + const auto Neighbors = rangeQuery(P); if (Neighbors.size() + 1 < MinPts) { // Density check. // The region around P is not dense enough to create a new cluster, mark // as noise for now. @@ -136,7 +138,7 @@ ClusterIdForPoint_[Q] = CurrentCluster.Id; CurrentCluster.PointIndices.push_back(Q); // And extend to the neighbors of Q if the region is dense enough. - const auto Neighbors = rangeQuery(Points_, Q, EpsilonSquared); + const auto Neighbors = rangeQuery(Q); if (Neighbors.size() + 1 >= MinPts) { ToProcess.insert(Neighbors.begin(), Neighbors.end()); } @@ -155,7 +157,7 @@ InstructionBenchmarkClustering::create( const std::vector &Points, const size_t MinPts, const double Epsilon) { - InstructionBenchmarkClustering Clustering(Points); + InstructionBenchmarkClustering Clustering(Points, Epsilon * Epsilon); if (auto Error = Clustering.validateAndSetup()) { return std::move(Error); } @@ -163,7 +165,7 @@ return Clustering; // Nothing to cluster. } - Clustering.dbScan(MinPts, Epsilon * Epsilon); + Clustering.dbScan(MinPts); return Clustering; } Index: tools/llvm-mca/SummaryView.h =================================================================== --- tools/llvm-mca/SummaryView.h +++ tools/llvm-mca/SummaryView.h @@ -45,10 +45,15 @@ unsigned TotalCycles; // The total number of micro opcodes contributed by a block of instructions. unsigned NumMicroOps; - // For each processor resource, this map stores the cumulative number of - // resource cycles consumed by a block of instructions. The resource mask ID - // is used as the key value to access elements of this map. - llvm::DenseMap ProcResourceUsage; + // For each processor resource, this vector stores the cumulative number of + // resource cycles consumed by the analyzed code block. + llvm::SmallVector ProcResourceUsage; + + // Each processor resource is associated with a so-called processor resource + // mask. This vector allows to correlate processor resource IDs with processor + // resource masks. There is exactly one element per each processor resource + // declared by the scheduling model. + llvm::SmallVector ProcResourceMasks; // Compute the reciprocal throughput for the analyzed code block. // The reciprocal block throughput is computed as the MAX between: @@ -58,9 +63,7 @@ public: SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S, - unsigned Width) - : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0), - NumMicroOps(0) {} + unsigned Width); void onCycleEnd() override { ++TotalCycles; } Index: tools/llvm-mca/SummaryView.cpp =================================================================== --- tools/llvm-mca/SummaryView.cpp +++ tools/llvm-mca/SummaryView.cpp @@ -24,6 +24,14 @@ using namespace llvm; +SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S, + unsigned Width) + : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0), + NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0), + ProcResourceMasks(Model.getNumProcResourceKinds(), 0) { + computeProcResourceMasks(SM, ProcResourceMasks); +} + void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) { // We are only interested in the "instruction dispatched" events generated by // the dispatch stage for instructions that are part of iteration #0. @@ -41,48 +49,14 @@ const InstrDesc &Desc = Inst.getDesc(); NumMicroOps += Desc.NumMicroOps; for (const std::pair &RU : Desc.Resources) { - if (!RU.second.size()) - continue; - - assert(RU.second.NumUnits && "Expected more than one unit used!"); - if (ProcResourceUsage.find(RU.first) == ProcResourceUsage.end()) { - ProcResourceUsage[RU.first] = RU.second.size(); - continue; - } - - ProcResourceUsage[RU.first] += RU.second.size(); - } -} - -double SummaryView::getBlockRThroughput() const { - assert(NumMicroOps && "Expected at least one micro opcode!"); - - SmallVector Masks(SM.getNumProcResourceKinds()); - computeProcResourceMasks(SM, Masks); - - // The block throughput is bounded from above by the hardware dispatch - // throughput. That is because the DispatchWidth is an upper bound on the - // number of opcodes that can be part of a single dispatch group. - double Max = static_cast(NumMicroOps) / DispatchWidth; - - // The block throughput is also limited by the amount of hardware parallelism. - // The number of available resource units affects the resource pressure - // distributed, as well as how many blocks can be executed every cycle. - for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { - uint64_t Mask = Masks[I]; - const auto It = ProcResourceUsage.find_as(Mask); - if (It != ProcResourceUsage.end()) { - const MCProcResourceDesc &MCDesc = *SM.getProcResource(I); - unsigned NumUnits = MCDesc.NumUnits; - double Throughput = static_cast(It->second) / NumUnits; - Max = std::max(Max, Throughput); + if (RU.second.size()) { + const auto It = find(ProcResourceMasks, RU.first); + assert(It != ProcResourceMasks.end() && + "Invalid processor resource mask!"); + ProcResourceUsage[std::distance(ProcResourceMasks.begin(), It)] += + RU.second.size(); } } - - // The block reciprocal throughput is computed as the MAX of: - // - (#uOps / DispatchWidth) - // - (#units / resource cycles) for every consumed processor resource. - return Max; } void SummaryView::printView(raw_ostream &OS) const { @@ -90,7 +64,8 @@ unsigned Instructions = Source.size(); unsigned TotalInstructions = Instructions * Iterations; double IPC = (double)TotalInstructions / TotalCycles; - double BlockRThroughput = getBlockRThroughput(); + double BlockRThroughput = computeBlockRThroughput( + SM, DispatchWidth, NumMicroOps, ProcResourceUsage); std::string Buffer; raw_string_ostream TempStream(Buffer); Index: tools/llvm-mca/Support.h =================================================================== --- tools/llvm-mca/Support.h +++ tools/llvm-mca/Support.h @@ -15,6 +15,7 @@ #ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H #define LLVM_TOOLS_LLVM_MCA_SUPPORT_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCSchedule.h" @@ -44,6 +45,14 @@ /// problems with simple bit manipulation operations. void computeProcResourceMasks(const llvm::MCSchedModel &SM, llvm::SmallVectorImpl &Masks); + +/// Compute the reciprocal block throughput from a set of processor resource +/// cycles. The reciprocal block throughput is computed as the MAX between: +/// - NumMicroOps / DispatchWidth +/// - ProcResourceCycles / #ProcResourceUnits (for every consumed resource). +double computeBlockRThroughput(const llvm::MCSchedModel &SM, + unsigned DispatchWidth, unsigned NumMicroOps, + llvm::ArrayRef ProcResourceUsage); } // namespace mca #endif Index: tools/llvm-mca/Support.cpp =================================================================== --- tools/llvm-mca/Support.cpp +++ tools/llvm-mca/Support.cpp @@ -48,4 +48,32 @@ ProcResourceID++; } } + +double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth, + unsigned NumMicroOps, + ArrayRef ProcResourceUsage) { + // The block throughput is bounded from above by the hardware dispatch + // throughput. That is because the DispatchWidth is an upper bound on the + // number of opcodes that can be part of a single dispatch group. + double Max = static_cast(NumMicroOps) / DispatchWidth; + + // The block throughput is also limited by the amount of hardware parallelism. + // The number of available resource units affects the resource pressure + // distribution, as well as how many blocks can be executed every cycle. + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + unsigned ResourceCycles = ProcResourceUsage[I]; + if (!ResourceCycles) + continue; + + const MCProcResourceDesc &MCDesc = *SM.getProcResource(I); + double Throughput = static_cast(ResourceCycles) / MCDesc.NumUnits; + Max = std::max(Max, Throughput); + } + + // The block reciprocal throughput is computed as the MAX of: + // - (NumMicroOps / DispatchWidth) + // - (NumUnits / ResourceCycles) for every consumed processor resource. + return Max; +} + } // namespace mca Index: unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp =================================================================== --- /dev/null +++ unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp @@ -0,0 +1,102 @@ +#include "Analysis.h" + +#include +#include + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace exegesis { +namespace { + +using testing::Pair; +using testing::UnorderedElementsAre; + +class AnalysisTest : public ::testing::Test { +protected: + AnalysisTest() { + const std::string TT = "x86_64-unknown-linux"; + std::string error; + const llvm::Target *const TheTarget = + llvm::TargetRegistry::lookupTarget(TT, error); + if (!TheTarget) { + llvm::errs() << error << "\n"; + return; + } + STI.reset(TheTarget->createMCSubtargetInfo(TT, "haswell", "")); + + // Compute the ProxResIdx of ports unes in tests. + const auto &SM = STI->getSchedModel(); + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const std::string Name = SM.getProcResource(I)->Name; + if (Name == "HWPort0") { + P0Idx = I; + } else if (Name == "HWPort1") { + P1Idx = I; + } else if (Name == "HWPort5") { + P5Idx = I; + } else if (Name == "HWPort6") { + P6Idx = I; + } else if (Name == "HWPort05") { + P05Idx = I; + } else if (Name == "HWPort0156") { + P0156Idx = I; + } + } + EXPECT_NE(P0Idx, 0); + EXPECT_NE(P1Idx, 0); + EXPECT_NE(P5Idx, 0); + EXPECT_NE(P6Idx, 0); + EXPECT_NE(P05Idx, 0); + EXPECT_NE(P0156Idx, 0); + } + + static void SetUpTestCase() { + LLVMInitializeX86TargetInfo(); + LLVMInitializeX86Target(); + LLVMInitializeX86TargetMC(); + } + +protected: + std::unique_ptr STI; + uint16_t P0Idx = 0; + uint16_t P1Idx = 0; + uint16_t P5Idx = 0; + uint16_t P6Idx = 0; + uint16_t P05Idx = 0; + uint16_t P0156Idx = 0; +}; + +TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2P0) { + const auto Pressure = + computeIdealizedProcResPressure(STI->getSchedModel(), {{P0Idx, 2}}); + EXPECT_THAT(Pressure, UnorderedElementsAre(Pair(P0Idx, 2.0))); +} + +TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2P05) { + const auto Pressure = + computeIdealizedProcResPressure(STI->getSchedModel(), {{P05Idx, 2}}); + EXPECT_THAT(Pressure, + UnorderedElementsAre(Pair(P0Idx, 1.0), Pair(P5Idx, 1.0))); +} + +TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2P05_2P0156) { + const auto Pressure = computeIdealizedProcResPressure( + STI->getSchedModel(), {{P05Idx, 2}, {P0156Idx, 2}}); + EXPECT_THAT(Pressure, + UnorderedElementsAre(Pair(P0Idx, 1.0), Pair(P1Idx, 1.0), + Pair(P5Idx, 1.0), Pair(P6Idx, 1.0))); +} + +TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_1P1_1P05_2P0156) { + const auto Pressure = computeIdealizedProcResPressure( + STI->getSchedModel(), {{P1Idx, 1}, {P05Idx, 1}, {P0156Idx, 2}}); + EXPECT_THAT(Pressure, + UnorderedElementsAre(Pair(P0Idx, 1.0), Pair(P1Idx, 1.0), + Pair(P5Idx, 1.0), Pair(P6Idx, 1.0))); +} + +} // namespace +} // namespace exegesis Index: unittests/tools/llvm-exegesis/X86/CMakeLists.txt =================================================================== --- unittests/tools/llvm-exegesis/X86/CMakeLists.txt +++ unittests/tools/llvm-exegesis/X86/CMakeLists.txt @@ -16,5 +16,6 @@ add_llvm_unittest(LLVMExegesisX86Tests RegisterAliasingTest.cpp AssemblerTest.cpp + AnalysisTest.cpp ) target_link_libraries(LLVMExegesisX86Tests PRIVATE LLVMExegesis)
ValidVariantuOpsLatencyWriteProcRes
WriteProcResIdealized " + "Resource Pressure
" << (SCDesc.isVariant() ? "✔" : "✕") << "" << SCDesc.NumMicroOps << "" << (SC.SCDesc.isVariant() ? "✔" : "✕") + << "" << SC.SCDesc.NumMicroOps << "
    "; - for (int I = 0, E = SCDesc.NumWriteLatencyEntries; I < E; ++I) { + for (int I = 0, E = SC.SCDesc.NumWriteLatencyEntries; I < E; ++I) { const auto *const Entry = - SubtargetInfo_->getWriteLatencyEntry(&SCDesc, I); + SubtargetInfo_->getWriteLatencyEntry(&SC.SCDesc, I); OS << "
  • " << Entry->Cycles; - if (SCDesc.NumWriteLatencyEntries > 1) { + if (SC.SCDesc.NumWriteLatencyEntries > 1) { // Dismabiguate if more than 1 latency. OS << " (WriteResourceID " << Entry->WriteResourceID << ")"; } @@ -323,13 +379,23 @@ OS << "
    "; - for (const auto &WPR : - getNonRedundantWriteProcRes(SCDesc, *SubtargetInfo_)) { + for (const auto &WPR : SC.NonRedundantWriteProcRes) { + OS << "
  • "; + writeEscaped(OS, + SM.getProcResource(WPR.ProcResourceIdx)->Name); + OS << ": " << WPR.Cycles << "
  • "; + } + OS << "
    "; + for (const auto &Pressure : SC.IdealizedProcResPressure) { OS << "
  • "; writeEscaped(OS, SubtargetInfo_->getSchedModel() - .getProcResource(WPR.ProcResourceIdx) + .getProcResource(Pressure.first) ->Name); - OS << ": " << WPR.Cycles << "
  • "; + OS << ": "; + writeMeasurementValue(OS, Pressure.second); + OS << ""; } OS << "