Index: include/llvm/CodeGen/TargetSubtargetInfo.h =================================================================== --- include/llvm/CodeGen/TargetSubtargetInfo.h +++ include/llvm/CodeGen/TargetSubtargetInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_TARGETSUBTARGETINFO_H #define LLVM_CODEGEN_TARGETSUBTARGETINFO_H +#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -144,6 +145,31 @@ return 0; } + /// Returns true if \param MI is a dependency breaking zero-idiom instruction + /// for the subtarget. + /// + /// This function also sets bits in \param Mask related to input operands that + /// are not in a data dependency relationship. There is one bit for each + /// machine operand; implicit operands follow explicit operands in the bit + /// representation used for \param Mask. An empty \param Mask (i.e. a mask + /// with all bits cleared) means: data dependencies are "broken" for all the + /// explicit input machine operands of \param MI. + virtual bool isZeroIdiom(const MachineInstr *MI, APInt &Mask) const { + return false; + } + + /// Returns true if \param MI is a dependency breaking instruction for the + /// subtarget. + /// + /// Similar in behavior to `isZeroIdiom`. However, it knows how to identify + /// all dependency breaking instructions (i.e. not just zero-idioms). + /// + /// As for `isZeroIdiom`, this method returns a mask of "broken" dependencies. + /// (See method `isZeroIdiom` for a detailed description of \param Mask). + virtual bool isDependencyBreaking(const MachineInstr *MI, APInt &Mask) const { + return isZeroIdiom(MI, Mask); + } + /// True if the subtarget should run MachineScheduler after aggressive /// coalescing. /// Index: include/llvm/MC/MCInstrAnalysis.h =================================================================== --- include/llvm/MC/MCInstrAnalysis.h +++ include/llvm/MC/MCInstrAnalysis.h @@ -88,18 +88,53 @@ const MCInst &Inst, APInt &Writes) const; - /// Returns true if \param Inst is a dependency breaking instruction for the + /// Returns true if \param MI is a dependency breaking zero-idiom for the /// given subtarget. /// + /// \param Mask is used to identify input operands that have their dependency + /// broken. Each bit of the mask is associated with a specific input operand. + /// Bits associated with explicit input operands are laid out first in the + /// mask; implicit operands come after explicit operands. + /// + /// Dependencies are broken only for operands that have their corresponding bit + /// set. Operands that have their bit cleared, or that don't have a + /// corresponding bit in the mask don't have their dependency broken. + /// Note that \param Mask may not be big enough to describe all operands. + /// The assumption for operands that don't have a correspondent bit in the + /// mask is that those are still data dependent. + /// + /// The only exception to the rule is for when \param Mask has all zeroes. + /// A zero mask means: dependencies are broken for all explicit register + /// operands. + virtual bool isZeroIdiom(const MCInst &MI, APInt &Mask, + unsigned CPUID) const { + return false; + } + + /// Returns true if \param MI is a dependency breaking instruction for the + /// subtarget associated with \param CPUID. + /// /// The value computed by a dependency breaking instruction is not dependent /// on the inputs. An example of dependency breaking instruction on X86 is /// `XOR %eax, %eax`. 
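As an illustration of how the new isZeroIdiom / isDependencyBreaking hooks are meant to be consumed from the MC layer, the sketch below mirrors what the llvm-mca changes later in this patch do. `MCIA`, `STI` and `Inst` are assumed to stand for a valid MCInstrAnalysis, MCSubtargetInfo and MCInst; the snippet is illustrative only and not part of the patch:

  llvm::APInt Mask;
  unsigned CPUID = STI.getSchedModel().getProcessorID();
  bool IsZeroIdiom = MCIA.isZeroIdiom(Inst, Mask, CPUID);
  bool IsDepBreaking =
      IsZeroIdiom || MCIA.isDependencyBreaking(Inst, Mask, CPUID);
  if (IsDepBreaking) {
    if (Mask.isNullValue()) {
      // An all-zero mask: every explicit input register operand is
      // independent from its definition.
    } else {
      // Otherwise, input operand I (explicit operands first, then implicit
      // operands) is independent only if bit I is set.
      for (unsigned I = 0, E = Mask.getBitWidth(); I < E; ++I)
        if (Mask[I]) {
          // Operand I carries no RAW dependency.
        }
    }
  }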
- /// TODO: In future, we could implement an alternative approach where this - /// method returns `true` if the input instruction is not dependent on - /// some/all of its input operands. An APInt mask could then be used to - /// identify independent operands. - virtual bool isDependencyBreaking(const MCSubtargetInfo &STI, - const MCInst &Inst) const; + /// + /// If \param MI is a dependency breaking instruction for subtarget \param + /// CPUID, then \param Mask can be inspected to identify independent operands. + /// + /// Essentially, each bit of the mask corresponds to an input operand. + /// Explicit operands are laid out first in the mask; implicit operands follow + /// explicit operands. Bits are set for operands that are independent. + /// + /// Note that the number of bits in Mask may not be equal to the sum of + /// explicit and implicit operands in \param MI. Operands that don't have a + /// corresponding bit in Mask are assumed "not independent". + /// + /// The only exception is when \param Mask is all zeroes. That means: + /// explicit input operands of \param MI are independent. + virtual bool isDependencyBreaking(const MCInst &MI, APInt &Mask, + unsigned CPUID) const { + return isZeroIdiom(MI, Mask, CPUID); + } /// Given a branch instruction try to get the address the branch /// targets. Return true on success, and the address in Target. Index: include/llvm/Target/TargetInstrPredicate.td =================================================================== --- include/llvm/Target/TargetInstrPredicate.td +++ include/llvm/Target/TargetInstrPredicate.td @@ -68,6 +68,7 @@ // Forward declarations. class Instruction; +class SchedMachineModel; // A generic machine instruction predicate. class MCInstPredicate; @@ -230,3 +231,102 @@ string MCInstFnName = MCInstFn; string MachineInstrFnName = MachineInstrFn; } + +// Used to classify machine instructions based on a machine instruction +// predicate. +// +// Let IC be an InstructionEquivalenceClass definition, and MI a machine +// instruction. We say that MI belongs to the equivalence class described by IC +// if and only if the following two conditions are met: +// a) MI's opcode is in the `opcodes` set, and +// b) `Predicate` evaluates to true when applied to MI. +// +// Instances of this class can be used by processor scheduling models to +// describe instructions that have a property in common. For example, +// InstructionEquivalenceClass definitions can be used to identify the set of +// dependency breaking instructions for a processor model. +// +// An (optional) list of operand indices can be used to further describe +// properties that apply to instruction operands. For example, it can be used to +// identify register uses of a dependency breaking instruction that are not in +// a RAW dependency. +class InstructionEquivalenceClass<list<Instruction> opcodes, + MCInstPredicate pred, + list<int> operands = []> { + list<Instruction> Opcodes = opcodes; + MCInstPredicate Predicate = pred; + list<int> OperandIndices = operands; +} + +// Used by processor models to describe dependency breaking instructions. +// +// This is mainly an alias for InstructionEquivalenceClass. Input operand +// `BrokenDeps` identifies the set of "broken dependencies". There is one bit +// for each implicit and explicit input operand. An empty set of broken +// dependencies means: "explicit input register operands are independent."
+class DepBreakingClass<list<Instruction> opcodes, MCInstPredicate pred, + list<int> BrokenDeps = []> + : InstructionEquivalenceClass<opcodes, pred, BrokenDeps>; + +// A function descriptor used to describe the signature of a predicate method +// which will be expanded by the STIPredicateExpander into a tablegen'd +// XXXGenSubtargetInfo class member definition (here, XXX is a target name). +// +// It describes the signature of a TargetSubtargetInfo hook, as well as a few +// extra properties. Examples of extra properties are: +// - The default return value for the auto-generated function hook. +// - A list of subtarget hooks (Delegates) that are called from this function. +// +class STIPredicateDecl<string name, MCInstPredicate default = FalsePred, + bit overrides = 1, bit machineInstrOnly = 0, + list<STIPredicateDecl> delegates = []> { + string Name = name; + MCInstPredicate DefaultReturnValue = default; + + // True if this method is declared as virtual in class TargetSubtargetInfo. + bit OverridesBaseClassMember = overrides; + + // True if this information is only meant to be used by CodeGen passes, and + // not exposed to the MC layer. + bit ExpandForMachineInstrOnly = machineInstrOnly; + + // True if the autogenerated method has an extra in/out APInt param used as a + // mask of operands. + bit UpdatesOpcodeMask = 0; + + // A list of STIPredicates used by this definition to delegate part of the + // computation. For example, STIPredicateFunction `isDependencyBreaking()` + // delegates to `isZeroIdiom()` part of its computation. + list<STIPredicateDecl> Delegates = delegates; +} + +// A predicate function definition member of class `XXXGenSubtargetInfo`. +// +// If `Declaration.ExpandForMachineInstrOnly` is false, then SubtargetEmitter +// will also expand another definition of this method that accepts an MCInst. +class STIPredicate<STIPredicateDecl declaration, + list<InstructionEquivalenceClass> classes, + MCInstPredicate default = FalsePred, + list<STIPredicateDecl> delegates = []> { + STIPredicateDecl Declaration = declaration; + list<InstructionEquivalenceClass> Classes = classes; + SchedMachineModel SchedModel = ?; +} + +// Convenience classes and definitions used by processor scheduling models to +// describe dependency breaking instructions.
+let UpdatesOpcodeMask = 1 in { + +def IsZeroIdiomDecl : STIPredicateDecl<"isZeroIdiom">; + +let Delegates = [IsZeroIdiomDecl] in +def IsDepBreakingDecl : STIPredicateDecl<"isDependencyBreaking">; + +} // UpdatesOpcodeMask + +class IsZeroIdiomFunction<list<DepBreakingClass> classes> + : STIPredicate<IsZeroIdiomDecl, classes>; + +class IsDepBreakingFunction<list<DepBreakingClass> classes> + : STIPredicate<IsDepBreakingDecl, classes>; Index: lib/MC/MCInstrAnalysis.cpp =================================================================== --- lib/MC/MCInstrAnalysis.cpp +++ lib/MC/MCInstrAnalysis.cpp @@ -24,11 +24,6 @@ return false; } -bool MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI, - const MCInst &Inst) const { - return false; -} - bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const { if (Inst.getNumOperands() == 0 || Index: lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp =================================================================== --- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -380,8 +380,9 @@ public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} - bool isDependencyBreaking(const MCSubtargetInfo &STI, - const MCInst &Inst) const override; +#define GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS +#include "X86GenSubtargetInfo.inc" + bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const override; std::vector<std::pair<uint64_t, uint64_t>> findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents, const Triple &TargetTriple) const override; }; -bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI, - const MCInst &Inst) const { - if (STI.getCPU() == "btver2") { - // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and - // Jaguar pipeline", subsection 8 "Dependency-breaking instructions".
- switch (Inst.getOpcode()) { - default: - return false; - case X86::SUB32rr: - case X86::SUB64rr: - case X86::SBB32rr: - case X86::SBB64rr: - case X86::XOR32rr: - case X86::XOR64rr: - case X86::XORPSrr: - case X86::XORPDrr: - case X86::VXORPSrr: - case X86::VXORPDrr: - case X86::ANDNPSrr: - case X86::VANDNPSrr: - case X86::ANDNPDrr: - case X86::VANDNPDrr: - case X86::PXORrr: - case X86::VPXORrr: - case X86::PANDNrr: - case X86::VPANDNrr: - case X86::PSUBBrr: - case X86::PSUBWrr: - case X86::PSUBDrr: - case X86::PSUBQrr: - case X86::VPSUBBrr: - case X86::VPSUBWrr: - case X86::VPSUBDrr: - case X86::VPSUBQrr: - case X86::PCMPEQBrr: - case X86::PCMPEQWrr: - case X86::PCMPEQDrr: - case X86::PCMPEQQrr: - case X86::VPCMPEQBrr: - case X86::VPCMPEQWrr: - case X86::VPCMPEQDrr: - case X86::VPCMPEQQrr: - case X86::PCMPGTBrr: - case X86::PCMPGTWrr: - case X86::PCMPGTDrr: - case X86::PCMPGTQrr: - case X86::VPCMPGTBrr: - case X86::VPCMPGTWrr: - case X86::VPCMPGTDrr: - case X86::VPCMPGTQrr: - case X86::MMX_PXORirr: - case X86::MMX_PANDNirr: - case X86::MMX_PSUBBirr: - case X86::MMX_PSUBDirr: - case X86::MMX_PSUBQirr: - case X86::MMX_PSUBWirr: - case X86::MMX_PCMPGTBirr: - case X86::MMX_PCMPGTDirr: - case X86::MMX_PCMPGTWirr: - case X86::MMX_PCMPEQBirr: - case X86::MMX_PCMPEQDirr: - case X86::MMX_PCMPEQWirr: - return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg(); - case X86::CMP32rr: - case X86::CMP64rr: - return Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg(); - } - } - - return false; -} +#define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS +#include "X86GenSubtargetInfo.inc" bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -687,4 +687,58 @@ def : InstRW<[JSlowLEA16r], (instrs LEA16r)>; +/////////////////////////////////////////////////////////////////////////////// +// Dependency breaking instructions. +/////////////////////////////////////////////////////////////////////////////// + +def : IsZeroIdiomFunction<[ + // GPR Zero-idioms. + DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>, + + // MMX Zero-idioms. + DepBreakingClass<[ + MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr, + MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr, + MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr + ], ZeroIdiomPredicate>, + + // SSE Zero-idioms. + DepBreakingClass<[ + PXORrr, PANDNrr, XORPSrr, XORPDrr, + ANDNPSrr, ANDNPDrr, PSUBBrr, PSUBDrr, + PSUBQrr, PSUBWrr, PCMPGTBrr, PCMPGTDrr, + PCMPGTQrr, PCMPGTWrr + ], ZeroIdiomPredicate>, + + // AVX Zero-idioms. 
+ DepBreakingClass<[ + VPXORrr, VPANDNrr, VXORPSrr, VXORPDrr, + VXORPSYrr, VXORPDYrr, VANDNPSrr, VANDNPDrr, + VANDNPSYrr, VANDNPDYrr, VPSUBBrr, VPSUBDrr, + VPSUBQrr, VPSUBWrr, VPCMPGTBrr, VPCMPGTDrr, + VPCMPGTQrr, VPCMPGTWrr + ], ZeroIdiomPredicate> +]>; + +def : IsDepBreakingFunction<[ + // GPR + DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>, + DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >, + + // MMX + DepBreakingClass<[ + MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr + ], ZeroIdiomPredicate>, + + // SSE + DepBreakingClass<[ + PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr + ], ZeroIdiomPredicate>, + + // AVX + DepBreakingClass<[ + VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr + ], ZeroIdiomPredicate> +]>; + } // SchedModel Index: test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s +++ test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s @@ -0,0 +1,322 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s + +# TODO: Fix the processor resource usage for zero-idiom YMM XOR instructions. +# Those vector XOR instructions should only consume 1cy of JFPU1 (instead +# of 2cy). + +# LLVM-MCA-BEGIN ZERO-IDIOM-1 + +vaddps %ymm0, %ymm0, %ymm1 +vxorps %ymm1, %ymm1, %ymm1 +vblendps $2, %ymm1, %ymm2, %ymm3 + +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ZERO-IDIOM-2 + +vaddpd %ymm0, %ymm0, %ymm1 +vxorpd %ymm1, %ymm1, %ymm1 +vblendpd $2, %ymm1, %ymm2, %ymm3 + +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ZERO-IDIOM-3 +vaddps %xmm0, %xmm1, %xmm2 +vandnps %xmm2, %xmm2, %xmm3 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ZERO-IDIOM-4 +vaddps %xmm0, %xmm1, %xmm2 +vandnps %xmm2, %xmm2, %xmm3 +# LLVM-MCA-END + +# CHECK: [0] Code Region - ZERO-IDIOM-1 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 306 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.96 +# CHECK-NEXT: IPC: 0.98 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 2.00 vaddps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 2 1 1.00 vxorps %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: 2 1 1.00 vblendps $2, %ymm1, %ymm2, %ymm3 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 3.00 3.00 3.00 3.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vxorps %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - vblendps $2, %ymm1, 
%ymm2, %ymm3 + +# CHECK: Timeline view: +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeER . . vaddps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [0,1] .DeE-R . . vxorps %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [0,2] . DeE-R . . vblendps $2, %ymm1, %ymm2, %ymm3 +# CHECK-NEXT: [1,0] . D=eeeER. . vaddps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [1,1] . DeE--R. . vxorps %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [1,2] . D=eE-R . vblendps $2, %ymm1, %ymm2, %ymm3 +# CHECK-NEXT: [2,0] . .DeeeER. vaddps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [2,1] . . D=eER. vxorps %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [2,2] . . D=eER vblendps $2, %ymm1, %ymm2, %ymm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 1.3 1.3 0.0 vaddps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 1. 3 1.3 1.3 1.0 vxorps %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: 2. 3 1.7 0.3 0.7 vblendps $2, %ymm1, %ymm2, %ymm3 + +# CHECK: [1] Code Region - ZERO-IDIOM-2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 306 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.96 +# CHECK-NEXT: IPC: 0.98 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 2.00 vaddpd %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 2 1 1.00 vxorpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: 2 1 1.00 vblendpd $2, %ymm1, %ymm2, %ymm3 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 3.00 3.00 3.00 3.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddpd %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vxorpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - vblendpd $2, %ymm1, %ymm2, %ymm3 + +# CHECK: Timeline view: +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeER . . vaddpd %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [0,1] .DeE-R . . vxorpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [0,2] . DeE-R . . vblendpd $2, %ymm1, %ymm2, %ymm3 +# CHECK-NEXT: [1,0] . D=eeeER. . vaddpd %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [1,1] . DeE--R. . vxorpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [1,2] . D=eE-R . vblendpd $2, %ymm1, %ymm2, %ymm3 +# CHECK-NEXT: [2,0] . .DeeeER. vaddpd %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [2,1] . . D=eER. vxorpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [2,2] . . 
D=eER vblendpd $2, %ymm1, %ymm2, %ymm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 1.3 1.3 0.0 vaddpd %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 1. 3 1.3 1.3 1.0 vxorpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: 2. 3 1.7 0.3 0.7 vblendpd $2, %ymm1, %ymm2, %ymm3 + +# CHECK: [2] Code Region - ZERO-IDIOM-3 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 200 +# CHECK-NEXT: Total Cycles: 105 +# CHECK-NEXT: Total uOps: 200 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.90 +# CHECK-NEXT: IPC: 1.90 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1 0 0.50 vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 + +# CHECK: [0,0] DeeeER . vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] D----R . vandnps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [1,0] .DeeeER. vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] .D----R. vandnps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [2,0] . DeeeER vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [2,1] . D----R vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1. 
3 0.0 0.0 4.0 vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: [3] Code Region - ZERO-IDIOM-4 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 200 +# CHECK-NEXT: Total Cycles: 105 +# CHECK-NEXT: Total uOps: 200 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.90 +# CHECK-NEXT: IPC: 1.90 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 1.00 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1 0 0.50 vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 + +# CHECK: [0,0] DeeeER . vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] D----R . vandnps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [1,0] .DeeeER. vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] .D----R. vandnps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [2,0] . DeeeER vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [2,1] . D----R vandnps %xmm2, %xmm2, %xmm3 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1. 3 0.0 0.0 4.0 vandnps %xmm2, %xmm2, %xmm3 Index: tools/llvm-mca/include/HardwareUnits/RegisterFile.h =================================================================== --- tools/llvm-mca/include/HardwareUnits/RegisterFile.h +++ tools/llvm-mca/include/HardwareUnits/RegisterFile.h @@ -136,16 +136,15 @@ // This method updates the register mappings inserting a new register // definition. This method is also responsible for updating the number of // allocated physical registers in each register file modified by the write. - // No physical regiser is allocated when flag ShouldAllocatePhysRegs is set. + // No physical regiser is allocated if this write is from a zero-idiom. void addRegisterWrite(WriteRef Write, - llvm::MutableArrayRef UsedPhysRegs, - bool ShouldAllocatePhysRegs = true); + llvm::MutableArrayRef UsedPhysRegs); // Removes write \param WS from the register mappings. // Physical registers may be released to reflect this update. + // No registers are released if this write is from a zero-idiom. 
void removeRegisterWrite(const WriteState &WS, - llvm::MutableArrayRef FreedPhysRegs, - bool ShouldFreePhysRegs = true); + llvm::MutableArrayRef FreedPhysRegs); // Checks if there are enough physical registers in the register files. // Returns a "response mask" where each bit represents the response from a Index: tools/llvm-mca/include/Instruction.h =================================================================== --- tools/llvm-mca/include/Instruction.h +++ tools/llvm-mca/include/Instruction.h @@ -102,6 +102,9 @@ // super-registers. bool ClearsSuperRegs; + // True if this write is from a dependency breaking zero-idiom instruction. + bool WritesZero; + // This field is set if this is a partial register write, and it has a false // dependency on any previous write of the same register (or a portion of it). // DependentWrite must be able to complete before this write completes, so @@ -121,10 +124,10 @@ public: WriteState(const WriteDescriptor &Desc, unsigned RegID, - bool clearsSuperRegs = false) + bool clearsSuperRegs = false, bool writesZero = false) : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), - ClearsSuperRegs(clearsSuperRegs), DependentWrite(nullptr), - NumWriteUsers(0U) {} + ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero), + DependentWrite(nullptr), NumWriteUsers(0U) {} WriteState(const WriteState &Other) = delete; WriteState &operator=(const WriteState &Other) = delete; @@ -137,6 +140,7 @@ unsigned getNumUsers() const { return Users.size() + NumWriteUsers; } bool clearsSuperRegisters() const { return ClearsSuperRegs; } + bool isWriteZero() const { return WritesZero; } const WriteState *getDependentWrite() const { return DependentWrite; } void setDependentWrite(WriteState *Other) { @@ -177,11 +181,14 @@ // This field is set to true only if there are no dependent writes, and // there are no `CyclesLeft' to wait. bool IsReady; + // True if this register read is from a dependency-breaking instruction. + bool IndependentFromDef; public: ReadState(const ReadDescriptor &Desc, unsigned RegID) : RD(Desc), RegisterID(RegID), DependentWrites(0), - CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {} + CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true), + IndependentFromDef(false) {} ReadState(const ReadState &Other) = delete; ReadState &operator=(const ReadState &Other) = delete; @@ -192,6 +199,9 @@ bool isReady() const { return IsReady; } bool isImplicitRead() const { return RD.isImplicitRead(); } + bool isIndependentFromDef() const { return IndependentFromDef; } + void setIndependentFromDef() { IndependentFromDef = true; } + void cycleEvent(); void writeStartEvent(unsigned Cycles); void setDependentWrites(unsigned Writes) { @@ -309,8 +319,6 @@ // Retire Unit token ID for this instruction. 
unsigned RCUTokenID; - bool IsDepBreaking; - using UniqueDef = std::unique_ptr; using UniqueUse = std::unique_ptr; using VecDefs = std::vector; @@ -326,8 +334,7 @@ public: Instruction(const InstrDesc &D) - : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0), - IsDepBreaking(false) {} + : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0) {} Instruction(const Instruction &Other) = delete; Instruction &operator=(const Instruction &Other) = delete; @@ -345,9 +352,6 @@ }); } - bool isDependencyBreaking() const { return IsDepBreaking; } - void setDependencyBreaking() { IsDepBreaking = true; } - unsigned getNumUsers() const { unsigned NumUsers = 0; for (const UniqueDef &Def : Defs) Index: tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp =================================================================== --- tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -139,8 +139,7 @@ } void RegisterFile::addRegisterWrite(WriteRef Write, - MutableArrayRef UsedPhysRegs, - bool ShouldAllocatePhysRegs) { + MutableArrayRef UsedPhysRegs) { WriteState &WS = *Write.getWriteState(); unsigned RegID = WS.getRegisterID(); assert(RegID && "Adding an invalid register definition?"); @@ -163,6 +162,7 @@ // a false dependency on RenameAs. The only exception is for when the write // implicitly clears the upper portion of the underlying register. // If a write clears its super-registers, then it is renamed as `RenameAs`. + bool ShouldAllocatePhysRegs = !WS.isWriteZero(); const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; if (RRI.RenameAs && RRI.RenameAs != RegID) { RegID = RRI.RenameAs; @@ -200,9 +200,8 @@ RegisterMappings[*I].first = Write; } -void RegisterFile::removeRegisterWrite(const WriteState &WS, - MutableArrayRef FreedPhysRegs, - bool ShouldFreePhysRegs) { +void RegisterFile::removeRegisterWrite( + const WriteState &WS, MutableArrayRef FreedPhysRegs) { unsigned RegID = WS.getRegisterID(); assert(RegID != 0 && "Invalidating an already invalid register?"); @@ -210,6 +209,7 @@ "Invalidating a write of unknown cycles!"); assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!"); + bool ShouldFreePhysRegs = !WS.isWriteZero(); unsigned RenameAs = RegisterMappings[RegID].second.RenameAs; if (RenameAs && RenameAs != RegID) { RegID = RenameAs; Index: tools/llvm-mca/lib/InstrBuilder.cpp =================================================================== --- tools/llvm-mca/lib/InstrBuilder.cpp +++ tools/llvm-mca/lib/InstrBuilder.cpp @@ -423,6 +423,14 @@ const InstrDesc &D = *DescOrErr; std::unique_ptr NewIS = llvm::make_unique(D); + // Check if this is a dependency breaking instruction. + APInt Mask; + + unsigned ProcID = STI.getSchedModel().getProcessorID(); + bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID); + bool IsDepBreaking = + IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID); + // Initialize Reads first. for (const ReadDescriptor &RD : D.Reads) { int RegID = -1; @@ -444,7 +452,15 @@ // Okay, this is a register operand. Create a ReadState for it. assert(RegID > 0 && "Invalid register ID found!"); - NewIS->getUses().emplace_back(llvm::make_unique(RD, RegID)); + auto RS = llvm::make_unique(RD, RegID); + + if (IsDepBreaking) { + if ((Mask.isNullValue() && !RD.isImplicitRead()) || + ((Mask.getBitWidth() > RD.UseIndex) && Mask[RD.UseIndex])) { + RS->setIndependentFromDef(); + } + } + NewIS->getUses().emplace_back(std::move(RS)); } // Early exit if there are no writes. 
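The mask test used above when initializing each ReadState deserves a short note: APInt::operator[] asserts if the bit position is not smaller than the bit width, and a mask built from a scheduling model only covers operand indices up to the largest one listed, so the extra width check is required before indexing. A minimal restatement of that rule in isolation (the helper name is made up for illustration):

  #include "llvm/ADT/APInt.h"

  // Returns true if the input operand at UseIndex has its dependency broken
  // by a dependency breaking instruction whose operand mask is Mask.
  static bool isIndependentOperand(const llvm::APInt &Mask, unsigned UseIndex,
                                   bool IsImplicitRead) {
    // An all-zero mask means: every explicit input operand is independent.
    if (Mask.isNullValue())
      return !IsImplicitRead;
    // Otherwise an operand is independent only if it has a bit, and that bit
    // is set. Operands without a corresponding bit stay data dependent.
    return Mask.getBitWidth() > UseIndex && Mask[UseIndex];
  }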
@@ -459,10 +475,6 @@ // register writes implicitly clear the upper portion of a super-register. MCIA.clearsSuperRegisters(MRI, MCI, WriteMask); - // Check if this is a dependency breaking instruction. - if (MCIA.isDependencyBreaking(STI, MCI)) - NewIS->setDependencyBreaking(); - // Initialize writes. unsigned WriteIndex = 0; for (const WriteDescriptor &WD : D.Writes) { @@ -476,7 +488,8 @@ assert(RegID && "Expected a valid register ID!"); NewIS->getDefs().emplace_back(llvm::make_unique( - WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex])); + WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex], + /* WritesZero */ IsZeroIdiom)); ++WriteIndex; } Index: tools/llvm-mca/lib/Stages/DispatchStage.cpp =================================================================== --- tools/llvm-mca/lib/Stages/DispatchStage.cpp +++ tools/llvm-mca/lib/Stages/DispatchStage.cpp @@ -106,21 +106,16 @@ // instruction. A dependency-breaking instruction is a zero-latency // instruction that doesn't consume hardware resources. // An example of dependency-breaking instruction on X86 is a zero-idiom XOR. - bool IsDependencyBreaking = IS.isDependencyBreaking(); for (std::unique_ptr &RS : IS.getUses()) - if (RS->isImplicitRead() || !IsDependencyBreaking) + if (!RS->isIndependentFromDef()) updateRAWDependencies(*RS, STI); - // By default, a dependency-breaking zero-latency instruction is expected to - // be optimized at register renaming stage. That means, no physical register - // is allocated to the instruction. - bool ShouldAllocateRegisters = - !(Desc.isZeroLatency() && IsDependencyBreaking); + // By default, a dependency-breaking zero-idiom is expected to be optimized + // at register renaming stage. That means, no physical register is allocated + // to the instruction. SmallVector RegisterFiles(PRF.getNumRegisterFiles()); - for (std::unique_ptr &WS : IS.getDefs()) { - PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles, - ShouldAllocateRegisters); - } + for (std::unique_ptr &WS : IS.getDefs()) + PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles); // Reserve slots in the RCU, and notify the instruction that it has been // dispatched to the schedulers for execution. Index: tools/llvm-mca/lib/Stages/RetireStage.cpp =================================================================== --- tools/llvm-mca/lib/Stages/RetireStage.cpp +++ tools/llvm-mca/lib/Stages/RetireStage.cpp @@ -51,11 +51,9 @@ LLVM_DEBUG(llvm::dbgs() << "[E] Instruction Retired: #" << IR << '\n'); llvm::SmallVector FreedRegs(PRF.getNumRegisterFiles()); const Instruction &Inst = *IR.getInstruction(); - const InstrDesc &Desc = Inst.getDesc(); - bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking()); for (const std::unique_ptr &WS : Inst.getDefs()) - PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs); + PRF.removeRegisterWrite(*WS.get(), FreedRegs); notifyEvent(HWInstructionRetiredEvent(IR, FreedRegs)); } Index: utils/TableGen/CodeGenSchedule.h =================================================================== --- utils/TableGen/CodeGenSchedule.h +++ utils/TableGen/CodeGenSchedule.h @@ -15,6 +15,7 @@ #ifndef LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H #define LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H +#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/ErrorHandling.h" @@ -270,6 +271,123 @@ #endif }; +/// Used to correlate instructions to predicates used by instruction equivalence +/// classes contributed by different processor models. 
+/// +/// Example: an XORrr whose register operands are all the same is a known +/// zero-idiom in most modern X86 processors. +/// +/// A PredicateInfo object can be used to identify a predicate used by +/// processor models to identify XORrr instructions that are zero-idioms. +/// If multiple processors use the same predicate for XORrr, then field +/// `ProcModelMask` will have more than one bit set. +/// +/// By construction, there can be multiple instances of PredicateInfo associated +/// with the same instruction opcode. For example, different processors may define +/// different constraints on the same opcode. +/// +/// Field OperandMask can be used as an extra constraint when classifying Predicates. +/// This may be used to describe conditions that apply only to a subset of the +/// operands of a machine instruction, and that operand subset may not be the +/// same for all processor models. +struct PredicateInfo { + llvm::APInt ProcModelMask; // A set of processor model indices. + llvm::APInt OperandMask; // An operand mask. + const Record *Predicate; // MCInstPredicate definition. + PredicateInfo(llvm::APInt CpuMask, llvm::APInt Operands, const Record *Pred) + : ProcModelMask(CpuMask), OperandMask(Operands), Predicate(Pred) {} + + bool operator==(const PredicateInfo &Other) const { + return ProcModelMask == Other.ProcModelMask && + OperandMask == Other.OperandMask && Predicate == Other.Predicate; + } +}; + +/// A collection of PredicateInfo objects. +/// +/// There is at least one OpcodeInfo object for every opcode specified by an +/// STIPredicate definition. +class OpcodeInfo { + llvm::SmallVector<PredicateInfo, 8> Predicates; + + OpcodeInfo(const OpcodeInfo &Other) = delete; + OpcodeInfo &operator=(const OpcodeInfo &Other) = delete; + +public: + OpcodeInfo() = default; + OpcodeInfo &operator=(OpcodeInfo &&Other) = default; + OpcodeInfo(OpcodeInfo &&Other) = default; + + ArrayRef<PredicateInfo> getPredicates() const { return Predicates; } + + void addPredicateForProcModel(const llvm::APInt &CpuMask, + const llvm::APInt &OperandMask, + const Record *Predicate); +}; + +/// Used to group together tablegen instruction definitions that are subject +/// to the same set of constraints (identified by an instance of OpcodeInfo). +class OpcodeGroup { + OpcodeInfo Info; + std::vector<const Record *> Opcodes; + + OpcodeGroup(const OpcodeGroup &Other) = delete; + OpcodeGroup &operator=(const OpcodeGroup &Other) = delete; + +public: + OpcodeGroup(OpcodeInfo &&OpInfo) : Info(std::move(OpInfo)) {} + OpcodeGroup(OpcodeGroup &&Other) = default; + + void addOpcode(const Record *Opcode) { + assert(std::find(Opcodes.begin(), Opcodes.end(), Opcode) == Opcodes.end() && + "Opcode already in set!"); + Opcodes.push_back(Opcode); + } + + ArrayRef<const Record *> getOpcodes() const { return Opcodes; } + const OpcodeInfo &getOpcodeInfo() const { return Info; } +}; + +/// An STIPredicateFunction descriptor used by tablegen backends to +/// auto-generate the body of a predicate function as a member of tablegen'd +/// class XXXGenSubtargetInfo.
+class STIPredicateFunction { + const Record *FunctionDeclaration; + + std::vector Definitions; + std::vector Groups; + + STIPredicateFunction(const STIPredicateFunction &Other) = delete; + STIPredicateFunction &operator=(const STIPredicateFunction &Other) = delete; + +public: + STIPredicateFunction(const Record *Rec) : FunctionDeclaration(Rec) {} + STIPredicateFunction(STIPredicateFunction &&Other) = default; + + bool isCompatibleWith(const STIPredicateFunction &Other) const { + return FunctionDeclaration == Other.FunctionDeclaration; + } + + void addDefinition(const Record *Def) { Definitions.push_back(Def); } + void addOpcode(const Record *OpcodeRec, OpcodeInfo &&Info) { + if (Groups.empty() || + Groups.back().getOpcodeInfo().getPredicates() != Info.getPredicates()) + Groups.emplace_back(std::move(Info)); + Groups.back().addOpcode(OpcodeRec); + } + + StringRef getName() const { + return FunctionDeclaration->getValueAsString("Name"); + } + const Record *getDefaultReturnPredicate() const { + return FunctionDeclaration->getValueAsDef("DefaultReturnValue"); + } + + const Record *getDeclaration() const { return FunctionDeclaration; } + ArrayRef getDefinitions() const { return Definitions; } + ArrayRef getGroups() const { return Groups; } +}; + /// Top level container for machine model data. class CodeGenSchedModels { RecordKeeper &Records; @@ -303,6 +421,8 @@ using InstClassMapTy = DenseMap; InstClassMapTy InstrClassMap; + std::vector STIPredicates; + public: CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT); @@ -430,6 +550,9 @@ Record *findProcResUnits(Record *ProcResKind, const CodeGenProcModel &PM, ArrayRef Loc) const; + ArrayRef getSTIPredicates() const { + return STIPredicates; + } private: void collectProcModels(); @@ -467,6 +590,10 @@ void checkMCInstPredicates() const; + void checkSTIPredicates() const; + + void collectSTIPredicates(); + void checkCompleteness(); void inferFromRW(ArrayRef OperWrites, ArrayRef OperReads, Index: utils/TableGen/CodeGenSchedule.cpp =================================================================== --- utils/TableGen/CodeGenSchedule.cpp +++ utils/TableGen/CodeGenSchedule.cpp @@ -225,9 +225,203 @@ // Check MCInstPredicate definitions. checkMCInstPredicates(); + // Check STIPredicate definitions. + checkSTIPredicates(); + + // Find STIPredicate definitions for each processor model, and construct + // STIPredicateFunction objects. + collectSTIPredicates(); + checkCompleteness(); } +void CodeGenSchedModels::checkSTIPredicates() const { + DenseMap Declarations; + + // There cannot be multiple declarations with the same name. + const RecVec Decls = Records.getAllDerivedDefinitions("STIPredicateDecl"); + for (const Record *R : Decls) { + StringRef Name = R->getValueAsString("Name"); + const auto It = Declarations.find(Name); + if (It == Declarations.end()) { + Declarations[Name] = R; + continue; + } + + PrintError(R->getLoc(), "STIPredicate " + Name + " multiply declared."); + PrintNote(It->second->getLoc(), "Previous declaration was here."); + PrintFatalError(R->getLoc(), "Invalid STIPredicateDecl found."); + } +} + +// Used by function `ProcessSTIPredicate` to construct a mask of machine +// instruction operands. 
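For illustration (the index values are made up), the mask built by the helper defined next for OperandIndices [1, 2] is a 3-bit value with bits 1 and 2 set, i.e. binary 110; an empty index list leaves the mask all-zero:

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  void operandMaskExample() {
    llvm::APInt Mask(3, 0); // One bit per operand index, up to the largest one.
    Mask.setBit(1);
    Mask.setBit(2);
    assert(Mask.getZExtValue() == 6 && "bit I corresponds to operand I");
  }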
+static APInt constructOperandMask(ArrayRef Indices) { + APInt OperandMask; + if (Indices.empty()) + return OperandMask; + + int64_t MaxIndex = *std::max_element(Indices.begin(), Indices.end()); + assert(MaxIndex >= 0 && "Invalid negative indices in input!"); + OperandMask = OperandMask.zext(MaxIndex + 1); + for (const int64_t Index : Indices) { + assert(Index >= 0 && "Invalid negative indices!"); + OperandMask.setBit(Index); + } + + return OperandMask; +} + +static void +processSTIPredicate(STIPredicateFunction &Fn, + const DenseMap &ProcModelMap) { + DenseMap Opcode2Index; + using OpcodeMapPair = std::pair; + std::vector OpcodeMappings; + std::vector> OpcodeMasks; + + DenseMap Predicate2Index; + unsigned NumUniquePredicates = 0; + + // Number unique predicates and opcodes used by InstructionEquivalenceClass + // definitions. Each unique opcode will be associated with an OpcodeInfo + // object. + for (const Record *Def : Fn.getDefinitions()) { + RecVec Classes = Def->getValueAsListOfDefs("Classes"); + for (const Record *EC : Classes) { + const Record *Pred = EC->getValueAsDef("Predicate"); + if (Predicate2Index.find(Pred) == Predicate2Index.end()) + Predicate2Index[Pred] = NumUniquePredicates++; + + RecVec Opcodes = EC->getValueAsListOfDefs("Opcodes"); + for (const Record *Opcode : Opcodes) { + if (Opcode2Index.find(Opcode) == Opcode2Index.end()) { + Opcode2Index[Opcode] = OpcodeMappings.size(); + OpcodeMappings.emplace_back(Opcode, OpcodeInfo()); + } + } + } + } + + // Initialize vector `OpcodeMasks` with default values. We want to keep track + // of which processors "use" which opcodes. We also want to be able to + // identify predicates that are used by different processors for a same + // opcode. + // This information is used later on by this algorithm to sort OpcodeMapping + // elements based on their processor and predicate sets. + OpcodeMasks.resize(OpcodeMappings.size()); + APInt DefaultProcMask(ProcModelMap.size(), 0); + APInt DefaultPredMask(NumUniquePredicates, 0); + for (std::pair &MaskPair : OpcodeMasks) + MaskPair = std::make_pair(DefaultProcMask, DefaultPredMask); + + // Construct a OpcodeInfo object for every unique opcode declared by an + // InstructionEquivalenceClass definition. + for (const Record *Def : Fn.getDefinitions()) { + RecVec Classes = Def->getValueAsListOfDefs("Classes"); + const Record *SchedModel = Def->getValueAsDef("SchedModel"); + unsigned ProcIndex = ProcModelMap.find(SchedModel)->second; + APInt ProcMask(ProcModelMap.size(), 0); + ProcMask.setBit(ProcIndex); + + for (const Record *EC : Classes) { + RecVec Opcodes = EC->getValueAsListOfDefs("Opcodes"); + + std::vector OpIndices = + EC->getValueAsListOfInts("OperandIndices"); + APInt OperandMask = constructOperandMask(OpIndices); + + const Record *Pred = EC->getValueAsDef("Predicate"); + APInt PredMask(NumUniquePredicates, 0); + PredMask.setBit(Predicate2Index[Pred]); + + for (const Record *Opcode : Opcodes) { + unsigned OpcodeIdx = Opcode2Index[Opcode]; + OpcodeMasks[OpcodeIdx].first |= ProcMask; + OpcodeMasks[OpcodeIdx].second |= PredMask; + OpcodeInfo &OI = OpcodeMappings[OpcodeIdx].second; + OI.addPredicateForProcModel(ProcMask, OperandMask, Pred); + } + } + } + + // Sort OpcodeMappings elements based on their CPU and predicate masks. + // As a last resort, order elements by opcode identifier. 
+ llvm::sort(OpcodeMappings.begin(), OpcodeMappings.end(), + [&](const OpcodeMapPair &Lhs, const OpcodeMapPair &Rhs) { + unsigned LhsIdx = Opcode2Index[Lhs.first]; + unsigned RhsIdx = Opcode2Index[Rhs.first]; + std::pair &LhsMasks = OpcodeMasks[LhsIdx]; + std::pair &RhsMasks = OpcodeMasks[RhsIdx]; + + if (LhsMasks.first != RhsMasks.first) { + if (LhsMasks.first.countPopulation() < + RhsMasks.first.countPopulation()) + return true; + return LhsMasks.first.countLeadingZeros() > + RhsMasks.first.countLeadingZeros(); + } + + if (LhsMasks.second != RhsMasks.second) { + if (LhsMasks.second.countPopulation() < + RhsMasks.second.countPopulation()) + return true; + return LhsMasks.second.countLeadingZeros() > + RhsMasks.second.countLeadingZeros(); + } + + return LhsIdx < RhsIdx; + }); + + // Now construct opcode groups. Groups are used by the SubtargetEmitter when + // expanding the body of a STIPredicate function. In particular, each opcode + // group is expanded into a sequence of labels in a switch statement. + // It identifies opcodes for which different processors define same predicates + // and same opcode masks. + for (OpcodeMapPair &Info : OpcodeMappings) + Fn.addOpcode(Info.first, std::move(Info.second)); +} + +void CodeGenSchedModels::collectSTIPredicates() { + // Map STIPredicateDecl records to elements of vector + // CodeGenSchedModels::STIPredicates. + DenseMap Decl2Index; + + RecVec RV = Records.getAllDerivedDefinitions("STIPredicate"); + for (const Record *R : RV) { + const Record *Decl = R->getValueAsDef("Declaration"); + + const auto It = Decl2Index.find(Decl); + if (It == Decl2Index.end()) { + Decl2Index[Decl] = STIPredicates.size(); + STIPredicateFunction Predicate(Decl); + Predicate.addDefinition(R); + STIPredicates.emplace_back(std::move(Predicate)); + continue; + } + + STIPredicateFunction &PreviousDef = STIPredicates[It->second]; + PreviousDef.addDefinition(R); + } + + for (STIPredicateFunction &Fn : STIPredicates) + processSTIPredicate(Fn, ProcModelMap); +} + +void OpcodeInfo::addPredicateForProcModel(const llvm::APInt &CpuMask, + const llvm::APInt &OperandMask, + const Record *Predicate) { + auto It = llvm::find_if( + Predicates, [&OperandMask, &Predicate](const PredicateInfo &P) { + return P.Predicate == Predicate && P.OperandMask == OperandMask; + }); + if (It == Predicates.end()) { + Predicates.emplace_back(CpuMask, OperandMask, Predicate); + return; + } + It->ProcModelMask |= CpuMask; +} + void CodeGenSchedModels::checkMCInstPredicates() const { RecVec MCPredicates = Records.getAllDerivedDefinitions("TIIPredicate"); if (MCPredicates.empty()) Index: utils/TableGen/PredicateExpander.h =================================================================== --- utils/TableGen/PredicateExpander.h +++ utils/TableGen/PredicateExpander.h @@ -43,14 +43,15 @@ bool shouldNegate() const { return NegatePredicate; } bool shouldExpandForMC() const { return ExpandForMC; } unsigned getIndentLevel() const { return IndentLevel; } + StringRef getTargetName() const { return TargetName; } void setByRef(bool Value) { EmitCallsByRef = Value; } void flipNegatePredicate() { NegatePredicate = !NegatePredicate; } void setNegatePredicate(bool Value) { NegatePredicate = Value; } void setExpandForMC(bool Value) { ExpandForMC = Value; } + void setIndentLevel(unsigned Level) { IndentLevel = Level; } void increaseIndentLevel() { ++IndentLevel; } void decreaseIndentLevel() { --IndentLevel; } - void setIndentLevel(unsigned Level) { IndentLevel = Level; } using RecVec = std::vector; void expandTrue(raw_ostream 
&OS); @@ -81,6 +82,36 @@ void expandStatement(raw_ostream &OS, const Record *Rec); }; +// Forward declarations. +class STIPredicateFunction; +class OpcodeGroup; + +class STIPredicateExpander : public PredicateExpander { + StringRef ClassPrefix; + bool ExpandDefinition; + + STIPredicateExpander(const PredicateExpander &) = delete; + STIPredicateExpander &operator=(const PredicateExpander &) = delete; + + void expandHeader(raw_ostream &OS, const STIPredicateFunction &Fn); + void expandPrologue(raw_ostream &OS, const STIPredicateFunction &Fn); + void expandOpcodeGroup(raw_ostream &OS, const OpcodeGroup &Group, + bool ShouldUpdateOpcodeMask); + void expandBody(raw_ostream &OS, const STIPredicateFunction &Fn); + void expandEpilogue(raw_ostream &OS, const STIPredicateFunction &Fn); + +public: + STIPredicateExpander(StringRef Target) + : PredicateExpander(Target), ClassPrefix(), ExpandDefinition(false) {} + + bool shouldExpandDefinition() const { return ExpandDefinition; } + StringRef getClassPrefix() const { return ClassPrefix; } + void setClassPrefix(StringRef S) { ClassPrefix = S; } + void setExpandDefinition(bool Value) { ExpandDefinition = Value; } + + void expandSTIPredicate(raw_ostream &OS, const STIPredicateFunction &Fn); +}; + } // namespace llvm #endif Index: utils/TableGen/PredicateExpander.cpp =================================================================== --- utils/TableGen/PredicateExpander.cpp +++ utils/TableGen/PredicateExpander.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "PredicateExpander.h" +#include "CodeGenSchedule.h" // Definition of STIPredicateFunction. namespace llvm { @@ -313,4 +314,158 @@ llvm_unreachable("No known rules to expand this MCInstPredicate"); } +void STIPredicateExpander::expandHeader(raw_ostream &OS, + const STIPredicateFunction &Fn) { + const Record *Rec = Fn.getDeclaration(); + StringRef FunctionName = Rec->getValueAsString("Name"); + + OS.indent(getIndentLevel() * 2); + OS << "bool "; + if (shouldExpandDefinition()) + OS << getClassPrefix() << "::"; + OS << FunctionName << "("; + if (shouldExpandForMC()) + OS << "const MCInst " << (isByRef() ? "&" : "*") << "MI"; + else + OS << "const MachineInstr " << (isByRef() ? "&" : "*") << "MI"; + if (Rec->getValueAsBit("UpdatesOpcodeMask")) + OS << ", APInt &Mask"; + OS << (shouldExpandForMC() ? 
", unsigned ProcessorID) const " : ") const "); + if (shouldExpandDefinition()) { + OS << "{\n"; + return; + } + + if (Rec->getValueAsBit("OverridesBaseClassMember")) + OS << "override"; + OS << ";\n"; +} + +void STIPredicateExpander::expandPrologue(raw_ostream &OS, + const STIPredicateFunction &Fn) { + RecVec Delegates = Fn.getDeclaration()->getValueAsListOfDefs("Delegates"); + bool UpdatesOpcodeMask = + Fn.getDeclaration()->getValueAsBit("UpdatesOpcodeMask"); + + increaseIndentLevel(); + unsigned IndentLevel = getIndentLevel(); + for (const Record *Delegate : Delegates) { + OS.indent(IndentLevel * 2); + OS << "if (" << Delegate->getValueAsString("Name") << "(MI"; + if (UpdatesOpcodeMask) + OS << ", Mask"; + if (shouldExpandForMC()) + OS << ", ProcessorID"; + OS << "))\n"; + OS.indent((1 + IndentLevel) * 2); + OS << "return true;\n\n"; + } + + if (shouldExpandForMC()) + return; + + OS.indent(IndentLevel * 2); + OS << "unsigned ProcessorID = getSchedModel().getProcessorID();\n"; +} + +void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS, const OpcodeGroup &Group, + bool ShouldUpdateOpcodeMask) { + const OpcodeInfo &OI = Group.getOpcodeInfo(); + for (const PredicateInfo &PI : OI.getPredicates()) { + const APInt &ProcModelMask = PI.ProcModelMask; + bool FirstProcID = true; + for (unsigned I = 0, E = ProcModelMask.getActiveBits(); I < E; ++I) { + if (!ProcModelMask[I]) + continue; + + if (FirstProcID) { + OS.indent(getIndentLevel() * 2); + OS << "if (ProcessorID == " << I; + } else { + OS << " || ProcessorID == " << I; + } + FirstProcID = false; + } + + OS << ") {\n"; + + increaseIndentLevel(); + OS.indent(getIndentLevel() * 2); + if (ShouldUpdateOpcodeMask) { + if (PI.OperandMask.isNullValue()) + OS << "Mask.clearAllBits();\n"; + else + OS << "Mask = " << PI.OperandMask << ";\n"; + OS.indent(getIndentLevel() * 2); + } + OS << "return "; + expandPredicate(OS, PI.Predicate); + OS << ";\n"; + decreaseIndentLevel(); + OS.indent(getIndentLevel() * 2); + OS << "}\n"; + } +} + +void STIPredicateExpander::expandBody(raw_ostream &OS, + const STIPredicateFunction &Fn) { + bool UpdatesOpcodeMask = + Fn.getDeclaration()->getValueAsBit("UpdatesOpcodeMask"); + + unsigned IndentLevel = getIndentLevel(); + OS.indent(IndentLevel * 2); + OS << "switch(MI" << (isByRef() ? "." 
: "->") << "getOpcode()) {\n"; + OS.indent(IndentLevel * 2); + OS << "default:\n"; + OS.indent(IndentLevel * 2); + OS << " break;"; + + for (const OpcodeGroup &Group : Fn.getGroups()) { + for (const Record *Opcode : Group.getOpcodes()) { + OS << '\n'; + OS.indent(IndentLevel * 2); + OS << "case " << getTargetName() << "::" << Opcode->getName() << ":"; + } + + OS << '\n'; + increaseIndentLevel(); + expandOpcodeGroup(OS, Group, UpdatesOpcodeMask); + + OS.indent(getIndentLevel() * 2); + OS << "break;\n"; + decreaseIndentLevel(); + } + + OS.indent(IndentLevel * 2); + OS << "}\n"; +} + +void STIPredicateExpander::expandEpilogue(raw_ostream &OS, + const STIPredicateFunction &Fn) { + OS << '\n'; + OS.indent(getIndentLevel() * 2); + OS << "return "; + expandPredicate(OS, Fn.getDefaultReturnPredicate()); + OS << ";\n"; + + decreaseIndentLevel(); + OS.indent(getIndentLevel() * 2); + StringRef FunctionName = Fn.getDeclaration()->getValueAsString("Name"); + OS << "} // " << ClassPrefix << "::" << FunctionName << "\n\n"; +} + +void STIPredicateExpander::expandSTIPredicate(raw_ostream &OS, + const STIPredicateFunction &Fn) { + const Record *Rec = Fn.getDeclaration(); + if (shouldExpandForMC() && Rec->getValueAsBit("ExpandForMachineInstrOnly")) + return; + + expandHeader(OS, Fn); + if (shouldExpandDefinition()) { + expandPrologue(OS, Fn); + expandBody(OS, Fn); + expandEpilogue(OS, Fn); + } +} + } // namespace llvm Index: utils/TableGen/SubtargetEmitter.cpp =================================================================== --- utils/TableGen/SubtargetEmitter.cpp +++ utils/TableGen/SubtargetEmitter.cpp @@ -116,6 +116,7 @@ void emitSchedModelHelpersImpl(raw_ostream &OS, bool OnlyExpandMCInstPredicates = false); void emitGenMCSubtargetInfo(raw_ostream &OS); + void EmitMCInstrAnalysisPredicateFunctions(raw_ostream &OS); void EmitSchedModel(raw_ostream &OS); void EmitHwModeCheck(const std::string &ClassName, raw_ostream &OS); @@ -1672,7 +1673,16 @@ << " unsigned CPUID) const {\n" << " return " << Target << "_MC" << "::resolveVariantSchedClassImpl(SchedClass, MI, CPUID);\n" - << "} // " << ClassName << "::resolveVariantSchedClass\n"; + << "} // " << ClassName << "::resolveVariantSchedClass\n\n"; + + STIPredicateExpander PE(Target); + PE.setClassPrefix(ClassName); + PE.setExpandDefinition(true); + PE.setByRef(false); + PE.setIndentLevel(0); + + for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) + PE.expandSTIPredicate(OS, Fn); } void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName, @@ -1766,6 +1776,31 @@ OS << "};\n"; } +void SubtargetEmitter::EmitMCInstrAnalysisPredicateFunctions(raw_ostream &OS) { + OS << "\n#ifdef GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS\n"; + OS << "#undef GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS\n\n"; + + STIPredicateExpander PE(Target); + PE.setExpandForMC(true); + PE.setByRef(true); + for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) + PE.expandSTIPredicate(OS, Fn); + + OS << "#endif // GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS\n\n"; + + OS << "\n#ifdef GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS\n"; + OS << "#undef GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS\n\n"; + + std::string ClassPrefix = Target + "MCInstrAnalysis"; + PE.setExpandDefinition(true); + PE.setClassPrefix(ClassPrefix); + PE.setIndentLevel(0); + for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) + PE.expandSTIPredicate(OS, Fn); + + OS << "#endif // GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS\n\n"; +} + // // SubtargetEmitter::run - Main subtarget enumeration emitter. 
// @@ -1863,6 +1898,12 @@ << " const;\n"; if (TGT.getHwModes().getNumModeIds() > 1) OS << " unsigned getHwMode() const override;\n"; + + STIPredicateExpander PE(Target); + PE.setByRef(false); + for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) + PE.expandSTIPredicate(OS, Fn); + OS << "};\n" << "} // end namespace llvm\n\n"; @@ -1920,6 +1961,8 @@ OS << "} // end namespace llvm\n\n"; OS << "#endif // GET_SUBTARGETINFO_CTOR\n\n"; + + EmitMCInstrAnalysisPredicateFunctions(OS); } namespace llvm {
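For reference, the following is a hand-written approximation of the kind of definition the expander emits for the MC variant of isZeroIdiom under GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS. It is not actual tablegen output: the processor ID value, the opcode grouping, and the expansion of ZeroIdiomPredicate are assumptions (the predicate is assumed to expand to the same-register check that the removed hand-written X86 code used):

  bool X86MCInstrAnalysis::isZeroIdiom(const MCInst &MI, APInt &Mask,
                                       unsigned ProcessorID) const {
    switch(MI.getOpcode()) {
    default:
      break;
    case X86::SUB32rr:
    case X86::SUB64rr:
    case X86::XOR32rr:
    case X86::XOR64rr:
      // Hypothetical: assume the BtVer2 model was assigned ProcessorID 1.
      if (ProcessorID == 1) {
        Mask.clearAllBits();
        return MI.getOperand(1).getReg() == MI.getOperand(2).getReg();
      }
      break;
    }

    return false;
  } // X86MCInstrAnalysis::isZeroIdiom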