diff --git a/llvm/test/tools/llvm-exegesis/X86/latency-LEA64r.s b/llvm/test/tools/llvm-exegesis/X86/latency-LEA64r.s new file mode 100644 index 000000000000..e5d6db2ada27 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/latency-LEA64r.s @@ -0,0 +1,16 @@ +# RUN: llvm-exegesis -mode=latency -opcode-name=LEA64r -repetition-mode=duplicate -max-configs-per-opcode=2 | FileCheck %s +# RUN: llvm-exegesis -mode=latency -opcode-name=LEA64r -repetition-mode=loop -max-configs-per-opcode=2 | FileCheck %s + +CHECK: --- +CHECK-NEXT: mode: latency +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: LEA64r +CHECK-NEXT: config: '0(%[[REG1:[A-Z0-9]+]], %[[REG1]], 1)' + +CHECK: --- +CHECK-NEXT: mode: latency +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: LEA64r +CHECK-NEXT: config: '42(%[[REG2:[A-Z0-9]+]], %[[REG2]], 1)' diff --git a/llvm/test/tools/llvm-exegesis/X86/uops-LEA64r.s b/llvm/test/tools/llvm-exegesis/X86/uops-LEA64r.s new file mode 100644 index 000000000000..68dde0a3be4b --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/uops-LEA64r.s @@ -0,0 +1,16 @@ +# RUN: llvm-exegesis -mode=uops -opcode-name=LEA64r -repetition-mode=duplicate -max-configs-per-opcode=2 | FileCheck %s +# RUN: llvm-exegesis -mode=uops -opcode-name=LEA64r -repetition-mode=loop -max-configs-per-opcode=2 | FileCheck %s + +CHECK: --- +CHECK-NEXT: mode: uops +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: LEA64r +CHECK-NEXT: config: '0(%[[REG1:[A-Z0-9]+]], %[[REG2:[A-Z0-9]+]], 1)' + +CHECK: --- +CHECK-NEXT: mode: uops +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: LEA64r +CHECK-NEXT: config: '42(%[[REG3:[A-Z0-9]+]], %[[REG4:[A-Z0-9]+]], 1)' diff --git a/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h b/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h index 0ddccdd6e52a..361acb641f88 100644 --- a/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h +++ b/llvm/tools/llvm-exegesis/lib/RegisterAliasing.h @@ -1,109 +1,116 @@ //===-- RegisterAliasingTracker.h -------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// Defines classes to keep track of register aliasing. /// //===----------------------------------------------------------------------===// #ifndef LLVM_TOOLS_LLVM_EXEGESIS_ALIASINGTRACKER_H #define LLVM_TOOLS_LLVM_EXEGESIS_ALIASINGTRACKER_H #include #include #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PackedVector.h" #include "llvm/MC/MCRegisterInfo.h" namespace llvm { namespace exegesis { // Returns the registers that are aliased by the ones set in SourceBits. llvm::BitVector getAliasedBits(const llvm::MCRegisterInfo &RegInfo, const llvm::BitVector &SourceBits); // Keeps track of a mapping from one register (or a register class) to its // aliased registers. // // e.g. // RegisterAliasingTracker Tracker(RegInfo, llvm::X86::EAX); // Tracker.sourceBits() == { llvm::X86::EAX } // Tracker.aliasedBits() == { llvm::X86::AL, llvm::X86::AH, llvm::X86::AX, // llvm::X86::EAX,llvm::X86::HAX, llvm::X86::RAX } // Tracker.getOrigin(llvm::X86::AL) == llvm::X86::EAX; // Tracker.getOrigin(llvm::X86::BX) == -1; struct RegisterAliasingTracker { // Construct a tracker from an MCRegisterClass. 
RegisterAliasingTracker(const llvm::MCRegisterInfo &RegInfo, const llvm::BitVector &ReservedReg, const llvm::MCRegisterClass &RegClass); // Construct a tracker from an MCPhysReg. RegisterAliasingTracker(const llvm::MCRegisterInfo &RegInfo, const llvm::MCPhysReg Register); const llvm::BitVector &sourceBits() const { return SourceBits; } // Retrieves all the touched registers as a BitVector. const llvm::BitVector &aliasedBits() const { return AliasedBits; } // Returns the origin of this register or -1. int getOrigin(llvm::MCPhysReg Aliased) const { if (!AliasedBits[Aliased]) return -1; return Origins[Aliased]; } private: RegisterAliasingTracker(const llvm::MCRegisterInfo &RegInfo); RegisterAliasingTracker(const RegisterAliasingTracker &) = delete; void FillOriginAndAliasedBits(const llvm::MCRegisterInfo &RegInfo, const llvm::BitVector &OriginalBits); llvm::BitVector SourceBits; llvm::BitVector AliasedBits; llvm::PackedVector Origins; // Max 1024 physical registers. }; // A cache of existing trackers. struct RegisterAliasingTrackerCache { // RegInfo must outlive the cache. RegisterAliasingTrackerCache(const llvm::MCRegisterInfo &RegInfo, const llvm::BitVector &ReservedReg); // Convenient function to retrieve a BitVector of the right size. const llvm::BitVector &emptyRegisters() const { return EmptyRegisters; } // Convenient function to retrieve the registers the function body can't use. const llvm::BitVector &reservedRegisters() const { return ReservedReg; } // Convenient function to retrieve the underlying MCRegInfo. const llvm::MCRegisterInfo ®Info() const { return RegInfo; } // Retrieves the RegisterAliasingTracker for this particular register. const RegisterAliasingTracker &getRegister(llvm::MCPhysReg Reg) const; // Retrieves the RegisterAliasingTracker for this particular register class. const RegisterAliasingTracker &getRegisterClass(unsigned RegClassIndex) const; private: const llvm::MCRegisterInfo &RegInfo; const llvm::BitVector ReservedReg; const llvm::BitVector EmptyRegisters; mutable std::unordered_map> Registers; mutable std::unordered_map> RegisterClasses; }; +// `a = a & ~b`, optimized for few bit sets in B and no allocation. +inline void remove(llvm::BitVector &A, const llvm::BitVector &B) { + assert(A.size() == B.size()); + for (auto I : B.set_bits()) + A.reset(I); +} + } // namespace exegesis } // namespace llvm #endif // LLVM_TOOLS_LLVM_EXEGESIS_ALIASINGTRACKER_H diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/Uops.cpp index 6823ec8fd175..c9cf06f409bc 100644 --- a/llvm/tools/llvm-exegesis/lib/Uops.cpp +++ b/llvm/tools/llvm-exegesis/lib/Uops.cpp @@ -1,259 +1,253 @@ //===-- Uops.cpp ------------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "Uops.h" #include "Assembler.h" #include "BenchmarkRunner.h" #include "MCInstrDescView.h" #include "Target.h" // FIXME: Load constants into registers (e.g. with fld1) to not break // instructions like x87. // Ideally we would like the only limitation on executing uops to be the issue // ports. Maximizing port pressure increases the likelihood that the load is // distributed evenly across possible ports. 
 // To achieve that, one approach is to generate instructions that do not have
 // data dependencies between them.
 //
 // For some instructions, this is trivial:
 // mov rax, qword ptr [rsi]
 // mov rax, qword ptr [rsi]
 // mov rax, qword ptr [rsi]
 // mov rax, qword ptr [rsi]
 // For the above snippet, Haswell just renames rax four times and executes the
 // four instructions two at a time on P23 and P0126.
 //
 // For some instructions, we just need to make sure that the source is
 // different from the destination. For example, IDIV8r reads from GPR and
 // writes to AX. We just need to ensure that the Var is assigned a
 // register which is different from AX:
 // idiv bx
 // idiv bx
 // idiv bx
 // idiv bx
 // The above snippet will be able to fully saturate the ports, while the same
 // with ax would issue one uop every `latency(IDIV8r)` cycles.
 //
 // Some instructions make this harder because they both read and write from
 // the same register:
 // inc rax
 // inc rax
 // inc rax
 // inc rax
 // This has a data dependency from each instruction to the next, limiting the
 // number of instructions that can be issued in parallel.
 // It turns out that this is not a big issue on recent Intel CPUs because they
 // have heuristics to balance port pressure. In the snippet above, subsequent
 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
 // might end up executing them all on P0 (just because they can), or try
 // avoiding P5 because it's usually under high pressure from vector
 // instructions.
 // This issue is even more important for high-latency instructions because
 // they increase the idle time of the CPU, e.g.:
 // imul rax, rbx
 // imul rax, rbx
 // imul rax, rbx
 // imul rax, rbx
 //
 // To avoid that, we do the renaming statically by generating as many
 // independent exclusive assignments as possible (until all possible registers
 // are exhausted), e.g.:
 // imul rax, rbx
 // imul rcx, rbx
 // imul rdx, rbx
 // imul r8, rbx
 //
 // Some instructions even make the above static renaming impossible because
 // they implicitly read and write from the same operand, e.g. ADC16rr reads
 // and writes from EFLAGS.
 // In that case we just use a greedy register assignment and hope for the
 // best.

 namespace llvm {
 namespace exegesis {

 static llvm::SmallVector<const Variable *, 8>
 getVariablesWithTiedOperands(const Instruction &Instr) {
   llvm::SmallVector<const Variable *, 8> Result;
   for (const auto &Var : Instr.Variables)
     if (Var.hasTiedOperands())
       Result.push_back(&Var);
   return Result;
 }

-static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
-  assert(a.size() == b.size());
-  for (auto I : b.set_bits())
-    a.reset(I);
-}
-
 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

 UopsSnippetGenerator::~UopsSnippetGenerator() = default;

 void UopsSnippetGenerator::instantiateMemoryOperands(
     const unsigned ScratchSpacePointerInReg,
     std::vector<InstructionTemplate> &Instructions) const {
   if (ScratchSpacePointerInReg == 0)
     return; // no memory operands.
const auto &ET = State.getExegesisTarget(); const unsigned MemStep = ET.getMaxMemoryAccessSize(); const size_t OriginalInstructionsSize = Instructions.size(); size_t I = 0; for (InstructionTemplate &IT : Instructions) { ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); ++I; } while (Instructions.size() < kMinNumDifferentAddresses) { InstructionTemplate IT = Instructions[I % OriginalInstructionsSize]; ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); ++I; Instructions.push_back(std::move(IT)); } assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize && "not enough scratch space"); } static std::vector generateSnippetUsingStaticRenaming( const LLVMState &State, const InstructionTemplate &IT, const ArrayRef TiedVariables, const BitVector &ForbiddenRegisters) { std::vector Instructions; // Assign registers to variables in a round-robin manner. This is simple but // ensures that the most register-constrained variable does not get starved. std::vector PossibleRegsForVar; for (const Variable *Var : TiedVariables) { assert(Var); const Operand &Op = IT.Instr.getPrimaryOperand(*Var); assert(Op.isReg()); BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits(); remove(PossibleRegs, ForbiddenRegisters); PossibleRegsForVar.push_back(std::move(PossibleRegs)); } SmallVector Iterators(TiedVariables.size(), 0); while (true) { InstructionTemplate TmpIT = IT; // Find a possible register for each variable in turn, marking the // register as taken. for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) { const int NextPossibleReg = PossibleRegsForVar[VarId].find_next(Iterators[VarId]); if (NextPossibleReg <= 0) { return Instructions; } TmpIT.getValueFor(*TiedVariables[VarId]) = llvm::MCOperand::createReg(NextPossibleReg); // Bump iterator. Iterators[VarId] = NextPossibleReg; // Prevent other variables from using the register. for (BitVector &OtherPossibleRegs : PossibleRegsForVar) { OtherPossibleRegs.reset(NextPossibleReg); } } Instructions.push_back(std::move(TmpIT)); } } llvm::Expected> UopsSnippetGenerator::generateCodeTemplates( const Instruction &Instr, const BitVector &ForbiddenRegisters) const { CodeTemplate CT; CT.ScratchSpacePointerInReg = Instr.hasMemoryOperands() ? State.getExegesisTarget().getScratchMemoryRegister( State.getTargetMachine().getTargetTriple()) : 0; const AliasingConfigurations SelfAliasing(Instr, Instr); InstructionTemplate IT(Instr); if (SelfAliasing.empty()) { CT.Info = "instruction is parallel, repeating a random one."; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } if (SelfAliasing.hasImplicitAliasing()) { CT.Info = "instruction is serial, repeating a random one."; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } const auto TiedVariables = getVariablesWithTiedOperands(Instr); if (!TiedVariables.empty()) { CT.Info = "instruction has tied variables, using static renaming."; CT.Instructions = generateSnippetUsingStaticRenaming( State, IT, TiedVariables, ForbiddenRegisters); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } // No tied variables, we pick random values for defs. 
llvm::BitVector Defs(State.getRegInfo().getNumRegs()); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); // Do not use forbidden registers. remove(PossibleRegisters, ForbiddenRegisters); assert(PossibleRegisters.any() && "No register left to choose from"); const auto RandomReg = randomBit(PossibleRegisters); Defs.set(RandomReg); IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg); } } // And pick random use values that are not reserved and don't alias with defs. const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); remove(PossibleRegisters, ForbiddenRegisters); remove(PossibleRegisters, DefAliases); assert(PossibleRegisters.any() && "No register left to choose from"); const auto RandomReg = randomBit(PossibleRegisters); IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg); } } CT.Info = "instruction has no tied variables picking Uses different from defs"; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } llvm::Expected> UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const { std::vector Result; const PfmCountersInfo &PCI = State.getPfmCounters(); // Uops per port. for (const auto *IssueCounter = PCI.IssueCounters, *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters; IssueCounter != IssueCounterEnd; ++IssueCounter) { if (!IssueCounter->Counter) continue; auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter); if (!ExpectedCounterValue) return ExpectedCounterValue.takeError(); Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName, *ExpectedCounterValue)); } // NumMicroOps. if (const char *const UopsCounter = PCI.UopsCounter) { auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter); if (!ExpectedCounterValue) return ExpectedCounterValue.takeError(); Result.push_back( BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue)); } return std::move(Result); } constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses; } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 1532af8ddec6..681bf4cebf82 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -1,629 +1,724 @@ //===-- Target.cpp ----------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "../Target.h" #include "../Error.h" #include "../Latency.h" #include "../SnippetGenerator.h" #include "../Uops.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/FormatVariadic.h" namespace llvm { namespace exegesis { // Returns an error if we cannot handle the memory references in this // instruction. 
static Error isInvalidMemoryInstr(const Instruction &Instr) { switch (Instr.Description->TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value"); // These have no memory access. case X86II::Pseudo: case X86II::RawFrm: case X86II::AddCCFrm: case X86II::MRMDestReg: case X86II::MRMSrcReg: case X86II::MRMSrcReg4VOp3: case X86II::MRMSrcRegOp4: case X86II::MRMSrcRegCC: case X86II::MRMXrCC: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: case X86II::RawFrmImm8: return Error::success(); case X86II::AddRegFrm: return (Instr.Description->Opcode == X86::POP16r || Instr.Description->Opcode == X86::POP32r || Instr.Description->Opcode == X86::PUSH16r || Instr.Description->Opcode == X86::PUSH32r) ? make_error( "unsupported opcode: unsupported memory access") : Error::success(); // These access memory and are handled. case X86II::MRMDestMem: case X86II::MRMSrcMem: case X86II::MRMSrcMem4VOp3: case X86II::MRMSrcMemOp4: case X86II::MRMSrcMemCC: case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: return Error::success(); // These access memory and are not handled yet. case X86II::RawFrmImm16: case X86II::RawFrmMemOffs: case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: return make_error("unsupported opcode: non uniform memory access"); } } static llvm::Error IsInvalidOpcode(const Instruction &Instr) { const auto OpcodeName = Instr.Name; if ((Instr.Description->TSFlags & X86II::FormMask) == X86II::Pseudo) return llvm::make_error("unsupported opcode: pseudo instruction"); if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") || OpcodeName.startswith("ADJCALLSTACK")) return llvm::make_error( "unsupported opcode: Push/Pop/AdjCallStack"); if (llvm::Error Error = isInvalidMemoryInstr(Instr)) return Error; // We do not handle instructions with OPERAND_PCREL. 
   for (const Operand &Op : Instr.Operands)
     if (Op.isExplicit() &&
         Op.getExplicitOperandInfo().OperandType == llvm::MCOI::OPERAND_PCREL)
       return llvm::make_error(
           "unsupported opcode: PC relative operand");

   // We do not handle second-form X87 instructions. We only handle first-form
   // ones (_Fp), see comment in X86InstrFPStack.td.
   for (const Operand &Op : Instr.Operands)
     if (Op.isReg() && Op.isExplicit() &&
         Op.getExplicitOperandInfo().RegClass == llvm::X86::RSTRegClassID)
       return llvm::make_error(
           "unsupported second-form X87 instruction");
   return llvm::Error::success();
 }

 static unsigned getX86FPFlags(const Instruction &Instr) {
   return Instr.Description->TSFlags & llvm::X86II::FPTypeMask;
 }

+// Helper to fill a memory operand with a value.
+static void setMemOp(InstructionTemplate &IT, int OpIdx,
+                     const MCOperand &OpVal) {
+  const auto Op = IT.Instr.Operands[OpIdx];
+  assert(Op.isExplicit() && "invalid memory pattern");
+  IT.getValueFor(Op) = OpVal;
+};
+
+// Common (latency, uops) code for LEA templates. `GetDestReg` takes the
+// addressing base and index registers and returns the LEA destination register.
+static llvm::Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
+    const Instruction &Instr, const BitVector &ForbiddenRegisters,
+    const LLVMState &State, const SnippetGenerator::Options &Opts,
+    std::function<unsigned(unsigned, unsigned)> GetDestReg) {
+  assert(Instr.Operands.size() == 6 && "invalid LEA");
+  assert(X86II::getMemoryOperandNo(Instr.Description->TSFlags) == 1 &&
+         "invalid LEA");
+
+  constexpr const int kDestOp = 0;
+  constexpr const int kBaseOp = 1;
+  constexpr const int kIndexOp = 3;
+  auto PossibleDestRegs =
+      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
+  remove(PossibleDestRegs, ForbiddenRegisters);
+  auto PossibleBaseRegs =
+      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
+  remove(PossibleBaseRegs, ForbiddenRegisters);
+  auto PossibleIndexRegs =
+      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
+  remove(PossibleIndexRegs, ForbiddenRegisters);
+
+  const auto &RegInfo = State.getRegInfo();
+  std::vector<CodeTemplate> Result;
+  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
+    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
+      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
+        // FIXME: Add an option for controlling how we explore immediates.
+        for (const int Disp : {0, 42}) {
+          InstructionTemplate IT(Instr);
+          const int64_t Scale = 1ull << LogScale;
+          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
+          setMemOp(IT, 2, MCOperand::createImm(Scale));
+          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
+          setMemOp(IT, 4, MCOperand::createImm(Disp));
+          // SegmentReg must be 0 for LEA.
+          setMemOp(IT, 5, MCOperand::createReg(0));
+
+          // Output reg is selected by the caller.
+          setMemOp(IT, 0, MCOperand::createReg(GetDestReg(BaseReg, IndexReg)));
+
+          CodeTemplate CT;
+          CT.Instructions.push_back(std::move(IT));
+          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
+                              RegInfo.getName(IndexReg), Scale, Disp)
+                          .str();
+          Result.push_back(std::move(CT));
+          if (Result.size() >= Opts.MaxConfigsPerOpcode)
+            return Result;
+        }
+      }
+    }
+  }
+
+  return Result;
+}
+
 namespace {
 class X86LatencySnippetGenerator : public LatencySnippetGenerator {
 public:
   using LatencySnippetGenerator::LatencySnippetGenerator;

   llvm::Expected<std::vector<CodeTemplate>>
   generateCodeTemplates(const Instruction &Instr,
                         const BitVector &ForbiddenRegisters) const override;
 };
 } // namespace

 llvm::Expected<std::vector<CodeTemplate>>
 X86LatencySnippetGenerator::generateCodeTemplates(
     const Instruction &Instr, const BitVector &ForbiddenRegisters) const {
   if (auto E = IsInvalidOpcode(Instr))
     return std::move(E);

+  // LEA gets special attention.
+  const auto Opcode = Instr.Description->getOpcode();
+  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
+    return generateLEATemplatesCommon(Instr, ForbiddenRegisters, State, Opts,
+                                      [](unsigned BaseReg, unsigned IndexReg) {
+                                        // We just select the same base and
+                                        // output register.
+                                        return BaseReg;
+                                      });
+  }
+
   switch (getX86FPFlags(Instr)) {
   case llvm::X86II::NotFP:
     return LatencySnippetGenerator::generateCodeTemplates(Instr,
                                                           ForbiddenRegisters);
   case llvm::X86II::ZeroArgFP:
   case llvm::X86II::OneArgFP:
   case llvm::X86II::SpecialFP:
   case llvm::X86II::CompareFP:
   case llvm::X86II::CondMovFP:
     return llvm::make_error("Unsupported x87 Instruction");
   case llvm::X86II::OneArgFPRW:
   case llvm::X86II::TwoArgFP:
     // These are instructions like
     // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
     // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
     // They are intrinsically serial and do not modify the state of the stack.
     return generateSelfAliasingCodeTemplates(Instr);
   default:
     llvm_unreachable("Unknown FP Type!");
   }
 }

 namespace {
 class X86UopsSnippetGenerator : public UopsSnippetGenerator {
 public:
   using UopsSnippetGenerator::UopsSnippetGenerator;

   llvm::Expected<std::vector<CodeTemplate>>
   generateCodeTemplates(const Instruction &Instr,
                         const BitVector &ForbiddenRegisters) const override;
 };
+
 } // namespace

 llvm::Expected<std::vector<CodeTemplate>>
 X86UopsSnippetGenerator::generateCodeTemplates(
     const Instruction &Instr, const BitVector &ForbiddenRegisters) const {
   if (auto E = IsInvalidOpcode(Instr))
     return std::move(E);

+  // LEA gets special attention.
+  const auto Opcode = Instr.Description->getOpcode();
+  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
+    // Any destination register that is not used for addressing is fine.
+ auto PossibleDestRegs = + Instr.Operands[0].getRegisterAliasing().sourceBits(); + remove(PossibleDestRegs, ForbiddenRegisters); + return generateLEATemplatesCommon( + Instr, ForbiddenRegisters, State, Opts, + [this, &PossibleDestRegs](unsigned BaseReg, unsigned IndexReg) { + auto PossibleDestRegsNow = PossibleDestRegs; + remove(PossibleDestRegsNow, + State.getRATC().getRegister(BaseReg).aliasedBits()); + remove(PossibleDestRegsNow, + State.getRATC().getRegister(IndexReg).aliasedBits()); + assert(PossibleDestRegsNow.set_bits().begin() != + PossibleDestRegsNow.set_bits().end() && + "no remaining registers"); + return *PossibleDestRegsNow.set_bits().begin(); + }); + } + switch (getX86FPFlags(Instr)) { case llvm::X86II::NotFP: return UopsSnippetGenerator::generateCodeTemplates(Instr, ForbiddenRegisters); case llvm::X86II::ZeroArgFP: case llvm::X86II::OneArgFP: case llvm::X86II::SpecialFP: return llvm::make_error("Unsupported x87 Instruction"); case llvm::X86II::OneArgFPRW: case llvm::X86II::TwoArgFP: // These are instructions like // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) // They are intrinsically serial and do not modify the state of the stack. // We generate the same code for latency and uops. return generateSelfAliasingCodeTemplates(Instr); case llvm::X86II::CompareFP: case llvm::X86II::CondMovFP: // We can compute uops for any FP instruction that does not grow or shrink // the stack (either do not touch the stack or push as much as they pop). return generateUnconstrainedCodeTemplates( Instr, "instruction does not grow/shrink the FP stack"); default: llvm_unreachable("Unknown FP Type!"); } } static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) { switch (RegBitWidth) { case 8: return llvm::X86::MOV8ri; case 16: return llvm::X86::MOV16ri; case 32: return llvm::X86::MOV32ri; case 64: return llvm::X86::MOV64ri; } llvm_unreachable("Invalid Value Width"); } // Generates instruction to load an immediate value into a register. static llvm::MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth, const llvm::APInt &Value) { if (Value.getBitWidth() > RegBitWidth) llvm_unreachable("Value must fit in the Register"); return llvm::MCInstBuilder(getLoadImmediateOpcode(RegBitWidth)) .addReg(Reg) .addImm(Value.getZExtValue()); } // Allocates scratch memory on the stack. static llvm::MCInst allocateStackSpace(unsigned Bytes) { return llvm::MCInstBuilder(llvm::X86::SUB64ri8) .addReg(llvm::X86::RSP) .addReg(llvm::X86::RSP) .addImm(Bytes); } // Fills scratch memory at offset `OffsetBytes` with value `Imm`. static llvm::MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes, uint64_t Imm) { return llvm::MCInstBuilder(MovOpcode) // Address = ESP .addReg(llvm::X86::RSP) // BaseReg .addImm(1) // ScaleAmt .addReg(0) // IndexReg .addImm(OffsetBytes) // Disp .addReg(0) // Segment // Immediate. .addImm(Imm); } // Loads scratch memory into register `Reg` using opcode `RMOpcode`. static llvm::MCInst loadToReg(unsigned Reg, unsigned RMOpcode) { return llvm::MCInstBuilder(RMOpcode) .addReg(Reg) // Address = ESP .addReg(llvm::X86::RSP) // BaseReg .addImm(1) // ScaleAmt .addReg(0) // IndexReg .addImm(0) // Disp .addReg(0); // Segment } // Releases scratch memory. 
static llvm::MCInst releaseStackSpace(unsigned Bytes) { return llvm::MCInstBuilder(llvm::X86::ADD64ri8) .addReg(llvm::X86::RSP) .addReg(llvm::X86::RSP) .addImm(Bytes); } // Reserves some space on the stack, fills it with the content of the provided // constant and provide methods to load the stack value into a register. namespace { struct ConstantInliner { explicit ConstantInliner(const llvm::APInt &Constant) : Constant_(Constant) {} std::vector loadAndFinalize(unsigned Reg, unsigned RegBitWidth, unsigned Opcode); std::vector loadX87STAndFinalize(unsigned Reg); std::vector loadX87FPAndFinalize(unsigned Reg); std::vector popFlagAndFinalize(); private: ConstantInliner &add(const llvm::MCInst &Inst) { Instructions.push_back(Inst); return *this; } void initStack(unsigned Bytes); static constexpr const unsigned kF80Bytes = 10; // 80 bits. llvm::APInt Constant_; std::vector Instructions; }; } // namespace std::vector ConstantInliner::loadAndFinalize(unsigned Reg, unsigned RegBitWidth, unsigned Opcode) { assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits"); initStack(RegBitWidth / 8); add(loadToReg(Reg, Opcode)); add(releaseStackSpace(RegBitWidth / 8)); return std::move(Instructions); } std::vector ConstantInliner::loadX87STAndFinalize(unsigned Reg) { initStack(kF80Bytes); add(llvm::MCInstBuilder(llvm::X86::LD_F80m) // Address = ESP .addReg(llvm::X86::RSP) // BaseReg .addImm(1) // ScaleAmt .addReg(0) // IndexReg .addImm(0) // Disp .addReg(0)); // Segment if (Reg != llvm::X86::ST0) add(llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(Reg)); add(releaseStackSpace(kF80Bytes)); return std::move(Instructions); } std::vector ConstantInliner::loadX87FPAndFinalize(unsigned Reg) { initStack(kF80Bytes); add(llvm::MCInstBuilder(llvm::X86::LD_Fp80m) .addReg(Reg) // Address = ESP .addReg(llvm::X86::RSP) // BaseReg .addImm(1) // ScaleAmt .addReg(0) // IndexReg .addImm(0) // Disp .addReg(0)); // Segment add(releaseStackSpace(kF80Bytes)); return std::move(Instructions); } std::vector ConstantInliner::popFlagAndFinalize() { initStack(8); add(llvm::MCInstBuilder(llvm::X86::POPF64)); return std::move(Instructions); } void ConstantInliner::initStack(unsigned Bytes) { assert(Constant_.getBitWidth() <= Bytes * 8 && "Value does not have the correct size"); const llvm::APInt WideConstant = Constant_.getBitWidth() < Bytes * 8 ? 
Constant_.sext(Bytes * 8) : Constant_; add(allocateStackSpace(Bytes)); size_t ByteOffset = 0; for (; Bytes - ByteOffset >= 4; ByteOffset += 4) add(fillStackSpace( llvm::X86::MOV32mi, ByteOffset, WideConstant.extractBits(32, ByteOffset * 8).getZExtValue())); if (Bytes - ByteOffset >= 2) { add(fillStackSpace( llvm::X86::MOV16mi, ByteOffset, WideConstant.extractBits(16, ByteOffset * 8).getZExtValue())); ByteOffset += 2; } if (Bytes - ByteOffset >= 1) add(fillStackSpace( llvm::X86::MOV8mi, ByteOffset, WideConstant.extractBits(8, ByteOffset * 8).getZExtValue())); } #include "X86GenExegesis.inc" namespace { class ExegesisX86Target : public ExegesisTarget { public: ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {} private: void addTargetSpecificPasses(llvm::PassManagerBase &PM) const override; unsigned getScratchMemoryRegister(const llvm::Triple &TT) const override; unsigned getLoopCounterRegister(const llvm::Triple &) const override; unsigned getMaxMemoryAccessSize() const override { return 64; } void randomizeMCOperand(const Instruction &Instr, const Variable &Var, llvm::MCOperand &AssignedValue, const llvm::BitVector &ForbiddenRegs) const override; void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, unsigned Offset) const override; void decrementLoopCounterAndJump(MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, const llvm::MCInstrInfo &MII) const override; std::vector setRegTo(const llvm::MCSubtargetInfo &STI, unsigned Reg, const llvm::APInt &Value) const override; ArrayRef getUnavailableRegisters() const override { return makeArrayRef(kUnavailableRegisters, sizeof(kUnavailableRegisters) / sizeof(kUnavailableRegisters[0])); } std::unique_ptr createLatencySnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const override { return std::make_unique(State, Opts); } std::unique_ptr createUopsSnippetGenerator( const LLVMState &State, const SnippetGenerator::Options &Opts) const override { return std::make_unique(State, Opts); } bool matchesArch(llvm::Triple::ArchType Arch) const override { return Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::x86; } static const unsigned kUnavailableRegisters[4]; }; // We disable a few registers that cannot be encoded on instructions with a REX // prefix. const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH, X86::CH, X86::DH}; // We're using one of R8-R15 because these registers are never hardcoded in // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less // conflicts. constexpr const unsigned kLoopCounterReg = X86::R8; } // namespace void ExegesisX86Target::addTargetSpecificPasses( llvm::PassManagerBase &PM) const { // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F. PM.add(llvm::createX86FloatingPointStackifierPass()); } unsigned ExegesisX86Target::getScratchMemoryRegister(const llvm::Triple &TT) const { if (!TT.isArch64Bit()) { // FIXME: This would require popping from the stack, so we would have to // add some additional setup code. return 0; } return TT.isOSWindows() ? 
llvm::X86::RCX : llvm::X86::RDI; } unsigned ExegesisX86Target::getLoopCounterRegister(const llvm::Triple &TT) const { if (!TT.isArch64Bit()) { return 0; } return kLoopCounterReg; } void ExegesisX86Target::randomizeMCOperand( const Instruction &Instr, const Variable &Var, llvm::MCOperand &AssignedValue, const llvm::BitVector &ForbiddenRegs) const { ExegesisTarget::randomizeMCOperand(Instr, Var, AssignedValue, ForbiddenRegs); const Operand &Op = Instr.getPrimaryOperand(Var); switch (Op.getExplicitOperandInfo().OperandType) { case llvm::X86::OperandType::OPERAND_COND_CODE: AssignedValue = llvm::MCOperand::createImm( randomIndex(llvm::X86::CondCode::LAST_VALID_COND)); break; default: break; } } void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, unsigned Offset) const { assert(!isInvalidMemoryInstr(IT.Instr) && "fillMemoryOperands requires a valid memory instruction"); int MemOpIdx = X86II::getMemoryOperandNo(IT.Instr.Description->TSFlags); assert(MemOpIdx >= 0 && "invalid memory operand index"); // getMemoryOperandNo() ignores tied operands, so we have to add them back. for (unsigned I = 0; I <= static_cast(MemOpIdx); ++I) { const auto &Op = IT.Instr.Operands[I]; if (Op.isTied() && Op.getTiedToIndex() < I) { ++MemOpIdx; } } - // Now fill in the memory operands. - const auto SetOp = [&IT](int OpIdx, const MCOperand &OpVal) { - const auto Op = IT.Instr.Operands[OpIdx]; - assert(Op.isMemory() && Op.isExplicit() && "invalid memory pattern"); - IT.getValueFor(Op) = OpVal; - }; - SetOp(MemOpIdx + 0, MCOperand::createReg(Reg)); // BaseReg - SetOp(MemOpIdx + 1, MCOperand::createImm(1)); // ScaleAmt - SetOp(MemOpIdx + 2, MCOperand::createReg(0)); // IndexReg - SetOp(MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp - SetOp(MemOpIdx + 4, MCOperand::createReg(0)); // Segment + setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg)); // BaseReg + setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1)); // ScaleAmt + setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0)); // IndexReg + setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp + setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0)); // Segment } void ExegesisX86Target::decrementLoopCounterAndJump( MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, const llvm::MCInstrInfo &MII) const { BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8)) .addDef(kLoopCounterReg) .addUse(kLoopCounterReg) .addImm(-1); BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1)) .addMBB(&TargetMBB) .addImm(X86::COND_NE); } std::vector ExegesisX86Target::setRegTo(const llvm::MCSubtargetInfo &STI, unsigned Reg, const llvm::APInt &Value) const { if (llvm::X86::GR8RegClass.contains(Reg)) return {loadImmediate(Reg, 8, Value)}; if (llvm::X86::GR16RegClass.contains(Reg)) return {loadImmediate(Reg, 16, Value)}; if (llvm::X86::GR32RegClass.contains(Reg)) return {loadImmediate(Reg, 32, Value)}; if (llvm::X86::GR64RegClass.contains(Reg)) return {loadImmediate(Reg, 64, Value)}; ConstantInliner CI(Value); if (llvm::X86::VR64RegClass.contains(Reg)) return CI.loadAndFinalize(Reg, 64, llvm::X86::MMX_MOVQ64rm); if (llvm::X86::VR128XRegClass.contains(Reg)) { if (STI.getFeatureBits()[llvm::X86::FeatureAVX512]) return CI.loadAndFinalize(Reg, 128, llvm::X86::VMOVDQU32Z128rm); if (STI.getFeatureBits()[llvm::X86::FeatureAVX]) return CI.loadAndFinalize(Reg, 128, llvm::X86::VMOVDQUrm); return CI.loadAndFinalize(Reg, 128, llvm::X86::MOVDQUrm); } if (llvm::X86::VR256XRegClass.contains(Reg)) { if (STI.getFeatureBits()[llvm::X86::FeatureAVX512]) return 
CI.loadAndFinalize(Reg, 256, llvm::X86::VMOVDQU32Z256rm); if (STI.getFeatureBits()[llvm::X86::FeatureAVX]) return CI.loadAndFinalize(Reg, 256, llvm::X86::VMOVDQUYrm); } if (llvm::X86::VR512RegClass.contains(Reg)) if (STI.getFeatureBits()[llvm::X86::FeatureAVX512]) return CI.loadAndFinalize(Reg, 512, llvm::X86::VMOVDQU32Zrm); if (llvm::X86::RSTRegClass.contains(Reg)) { return CI.loadX87STAndFinalize(Reg); } if (llvm::X86::RFP32RegClass.contains(Reg) || llvm::X86::RFP64RegClass.contains(Reg) || llvm::X86::RFP80RegClass.contains(Reg)) { return CI.loadX87FPAndFinalize(Reg); } if (Reg == llvm::X86::EFLAGS) return CI.popFlagAndFinalize(); return {}; // Not yet implemented. } static ExegesisTarget *getTheExegesisX86Target() { static ExegesisX86Target Target; return &Target; } void InitializeX86ExegesisTarget() { ExegesisTarget::registerTarget(getTheExegesisX86Target()); } } // namespace exegesis } // namespace llvm
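
Note on the new test expectations: generateLEATemplatesCommon enumerates addressing-mode configs with the displacement loop {0, 42} innermost, so with -max-configs-per-opcode=2 the two emitted configs share the first available base/index register and a scale of 1, differing only in the displacement. That is what the FileCheck patterns in latency-LEA64r.s and uops-LEA64r.s match. The standalone sketch below (not part of the patch) shows how the config string is built with llvm::formatv; the register name RAX is only an illustrative stand-in for whatever register the generator picks first, and the snippet assumes it is compiled against LLVM headers and linked with LLVMSupport.

  // Illustration of the config-string formatting used by generateLEATemplatesCommon.
  #include "llvm/Support/FormatVariadic.h"
  #include <cstdio>
  #include <string>

  int main() {
    const char *BaseName = "RAX";  // example register names only
    const char *IndexName = "RAX";
    // Disp is the innermost loop in the generator, so the first two configs
    // differ only in the displacement (0 vs. 42).
    for (int Disp : {0, 42}) {
      const std::string Config =
          llvm::formatv("{3}(%{0}, %{1}, {2})", BaseName, IndexName,
                        /*Scale=*/1, Disp)
              .str();
      std::printf("%s\n", Config.c_str()); // "0(%RAX, %RAX, 1)" then "42(%RAX, %RAX, 1)"
    }
    return 0;
  }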