diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index 8c6f94052295..fa08166791b0 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -1,501 +1,494 @@
//===- LiveIntervals.h - Live Interval Analysis -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LiveInterval analysis pass. Given some
/// numbering of each of the machine instructions (in this implementation
/// depth-first order) an interval [i, j) is said to be a live interval for
/// register v if there is no instruction with number j' > j such that v is
/// live at j' and there is no instruction with number i' < i such that v is
/// live at i'. In this implementation intervals can have holes, i.e. an
/// interval might look like [1,20), [50,65), [1000,1001).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_LIVEINTERVALS_H
#define LLVM_CODEGEN_LIVEINTERVALS_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstdint>
#include <utility>

namespace llvm {

extern cl::opt<bool> UseSegmentSetForPhysRegs;

class AAResults;
class BitVector;
class LiveIntervalCalc;
class MachineBlockFrequencyInfo;
class MachineDominatorTree;
class MachineFunction;
class MachineInstr;
class MachineRegisterInfo;
class raw_ostream;
class TargetInstrInfo;
class VirtRegMap;

class LiveIntervals : public MachineFunctionPass {
  MachineFunction* MF;
  MachineRegisterInfo* MRI;
  const TargetRegisterInfo* TRI;
  const TargetInstrInfo* TII;
  AAResults *AA;
  SlotIndexes* Indexes;
  MachineDominatorTree *DomTree = nullptr;
  LiveIntervalCalc *LICalc = nullptr;

  /// Special pool allocator for VNInfo's (LiveInterval val#).
  VNInfo::Allocator VNInfoAllocator;

  /// Live interval pointers for all the virtual registers.
  IndexedMap<LiveInterval *, VirtReg2IndexFunctor> VirtRegIntervals;

  /// Sorted list of instructions with register mask operands. Always use the
  /// 'r' slot, RegMasks are normal clobbers, not early clobbers.
  SmallVector<SlotIndex, 8> RegMaskSlots;

  /// This vector is parallel to RegMaskSlots, it holds a pointer to the
  /// corresponding register mask. This pointer can be recomputed as:
  ///
  ///   MI = Indexes->getInstructionFromIndex(RegMaskSlot[N]);
  ///   unsigned OpNum = findRegMaskOperand(MI);
  ///   RegMaskBits[N] = MI->getOperand(OpNum).getRegMask();
  ///
  /// This is kept in a separate vector partly because some standard
  /// libraries don't support lower_bound() with mixed objects, partly to
  /// improve locality when searching in RegMaskSlots.
  /// Also see the comment in LiveInterval::find().
  SmallVector<const uint32_t *, 8> RegMaskBits;

  /// For each basic block number, keep (begin, size) pairs indexing into the
  /// RegMaskSlots and RegMaskBits arrays.
  /// Note that basic block numbers may not be layout contiguous, that's why
  /// we can't just keep track of the first register mask in each basic
  /// block.
SmallVector, 8> RegMaskBlocks; /// Keeps a live range set for each register unit to track fixed physreg /// interference. SmallVector RegUnitRanges; public: static char ID; LiveIntervals(); ~LiveIntervals() override; /// Calculate the spill weight to assign to a single instruction. static float getSpillWeight(bool isDef, bool isUse, const MachineBlockFrequencyInfo *MBFI, const MachineInstr &MI); /// Calculate the spill weight to assign to a single instruction. static float getSpillWeight(bool isDef, bool isUse, const MachineBlockFrequencyInfo *MBFI, const MachineBasicBlock *MBB); LiveInterval &getInterval(Register Reg) { if (hasInterval(Reg)) return *VirtRegIntervals[Reg.id()]; return createAndComputeVirtRegInterval(Reg); } const LiveInterval &getInterval(Register Reg) const { return const_cast(this)->getInterval(Reg); } bool hasInterval(Register Reg) const { return VirtRegIntervals.inBounds(Reg.id()) && VirtRegIntervals[Reg.id()]; } /// Interval creation. LiveInterval &createEmptyInterval(Register Reg) { assert(!hasInterval(Reg) && "Interval already exists!"); VirtRegIntervals.grow(Reg.id()); VirtRegIntervals[Reg.id()] = createInterval(Reg); return *VirtRegIntervals[Reg.id()]; } LiveInterval &createAndComputeVirtRegInterval(Register Reg) { LiveInterval &LI = createEmptyInterval(Reg); computeVirtRegInterval(LI); return LI; } /// Interval removal. void removeInterval(Register Reg) { delete VirtRegIntervals[Reg]; VirtRegIntervals[Reg] = nullptr; } /// Given a register and an instruction, adds a live segment from that /// instruction to the end of its MBB. LiveInterval::Segment addSegmentToEndOfBlock(Register Reg, MachineInstr &startInst); /// After removing some uses of a register, shrink its live range to just /// the remaining uses. This method does not compute reaching defs for new /// uses, and it doesn't remove dead defs. /// Dead PHIDef values are marked as unused. New dead machine instructions /// are added to the dead vector. Returns true if the interval may have been /// separated into multiple connected components. bool shrinkToUses(LiveInterval *li, SmallVectorImpl *dead = nullptr); /// Specialized version of /// shrinkToUses(LiveInterval *li, SmallVectorImpl *dead) /// that works on a subregister live range and only looks at uses matching /// the lane mask of the subregister range. /// This may leave the subrange empty which needs to be cleaned up with /// LiveInterval::removeEmptySubranges() afterwards. void shrinkToUses(LiveInterval::SubRange &SR, Register Reg); /// Extend the live range \p LR to reach all points in \p Indices. The /// points in the \p Indices array must be jointly dominated by the union /// of the existing defs in \p LR and points in \p Undefs. /// /// PHI-defs are added as needed to maintain SSA form. /// /// If a SlotIndex in \p Indices is the end index of a basic block, \p LR /// will be extended to be live out of the basic block. /// If a SlotIndex in \p Indices is jointy dominated only by points in /// \p Undefs, the live range will not be extended to that point. /// /// See also LiveRangeCalc::extend(). void extendToIndices(LiveRange &LR, ArrayRef Indices, ArrayRef Undefs); void extendToIndices(LiveRange &LR, ArrayRef Indices) { extendToIndices(LR, Indices, /*Undefs=*/{}); } /// If \p LR has a live value at \p Kill, prune its live range by removing /// any liveness reachable from Kill. Add live range end points to /// EndPoints such that extendToIndices(LI, EndPoints) will reconstruct the /// value's live range. 
/// /// Calling pruneValue() and extendToIndices() can be used to reconstruct /// SSA form after adding defs to a virtual register. void pruneValue(LiveRange &LR, SlotIndex Kill, SmallVectorImpl *EndPoints); /// This function should not be used. Its intent is to tell you that you are /// doing something wrong if you call pruneValue directly on a /// LiveInterval. Indeed, you are supposed to call pruneValue on the main /// LiveRange and all the LiveRanges of the subranges if any. LLVM_ATTRIBUTE_UNUSED void pruneValue(LiveInterval &, SlotIndex, SmallVectorImpl *) { llvm_unreachable( "Use pruneValue on the main LiveRange and on each subrange"); } SlotIndexes *getSlotIndexes() const { return Indexes; } AAResults *getAliasAnalysis() const { return AA; } /// Returns true if the specified machine instr has been removed or was /// never entered in the map. bool isNotInMIMap(const MachineInstr &Instr) const { return !Indexes->hasIndex(Instr); } /// Returns the base index of the given instruction. SlotIndex getInstructionIndex(const MachineInstr &Instr) const { return Indexes->getInstructionIndex(Instr); } /// Returns the instruction associated with the given index. MachineInstr* getInstructionFromIndex(SlotIndex index) const { return Indexes->getInstructionFromIndex(index); } /// Return the first index in the given basic block. SlotIndex getMBBStartIdx(const MachineBasicBlock *mbb) const { return Indexes->getMBBStartIdx(mbb); } /// Return the last index in the given basic block. SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const { return Indexes->getMBBEndIdx(mbb); } bool isLiveInToMBB(const LiveRange &LR, const MachineBasicBlock *mbb) const { return LR.liveAt(getMBBStartIdx(mbb)); } bool isLiveOutOfMBB(const LiveRange &LR, const MachineBasicBlock *mbb) const { return LR.liveAt(getMBBEndIdx(mbb).getPrevSlot()); } MachineBasicBlock* getMBBFromIndex(SlotIndex index) const { return Indexes->getMBBFromIndex(index); } void insertMBBInMaps(MachineBasicBlock *MBB) { Indexes->insertMBBInMaps(MBB); assert(unsigned(MBB->getNumber()) == RegMaskBlocks.size() && "Blocks must be added in order."); RegMaskBlocks.push_back(std::make_pair(RegMaskSlots.size(), 0)); } SlotIndex InsertMachineInstrInMaps(MachineInstr &MI) { return Indexes->insertMachineInstrInMaps(MI); } void InsertMachineInstrRangeInMaps(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E) { for (MachineBasicBlock::iterator I = B; I != E; ++I) Indexes->insertMachineInstrInMaps(*I); } void RemoveMachineInstrFromMaps(MachineInstr &MI) { Indexes->removeMachineInstrFromMaps(MI); } SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI) { return Indexes->replaceMachineInstrInMaps(MI, NewMI); } VNInfo::Allocator& getVNInfoAllocator() { return VNInfoAllocator; } void getAnalysisUsage(AnalysisUsage &AU) const override; void releaseMemory() override; /// Pass entry point; Calculates LiveIntervals. bool runOnMachineFunction(MachineFunction&) override; /// Implement the dump method. void print(raw_ostream &O, const Module* = nullptr) const override; /// If LI is confined to a single basic block, return a pointer to that /// block. If LI is live in to or out of any block, return NULL. MachineBasicBlock *intervalIsInOneMBB(const LiveInterval &LI) const; /// Returns true if VNI is killed by any PHI-def values in LI. /// This may conservatively return true to avoid expensive computations. bool hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const; /// Add kill flags to any instruction that kills a virtual register. 
void addKillFlags(const VirtRegMap*); /// Call this method to notify LiveIntervals that instruction \p MI has been /// moved within a basic block. This will update the live intervals for all /// operands of \p MI. Moves between basic blocks are not supported. /// /// \param UpdateFlags Update live intervals for nonallocatable physregs. void handleMove(MachineInstr &MI, bool UpdateFlags = false); /// Update intervals of operands of all instructions in the newly /// created bundle specified by \p BundleStart. /// /// \param UpdateFlags Update live intervals for nonallocatable physregs. /// /// Assumes existing liveness is accurate. /// \pre BundleStart should be the first instruction in the Bundle. /// \pre BundleStart should not have a have SlotIndex as one will be assigned. void handleMoveIntoNewBundle(MachineInstr &BundleStart, bool UpdateFlags = false); /// Update live intervals for instructions in a range of iterators. It is /// intended for use after target hooks that may insert or remove /// instructions, and is only efficient for a small number of instructions. /// /// OrigRegs is a vector of registers that were originally used by the /// instructions in the range between the two iterators. /// /// Currently, the only only changes that are supported are simple removal /// and addition of uses. void repairIntervalsInRange(MachineBasicBlock *MBB, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, ArrayRef OrigRegs); // Register mask functions. // // Machine instructions may use a register mask operand to indicate that a // large number of registers are clobbered by the instruction. This is // typically used for calls. // // For compile time performance reasons, these clobbers are not recorded in // the live intervals for individual physical registers. Instead, // LiveIntervalAnalysis maintains a sorted list of instructions with // register mask operands. /// Returns a sorted array of slot indices of all instructions with /// register mask operands. ArrayRef getRegMaskSlots() const { return RegMaskSlots; } /// Returns a sorted array of slot indices of all instructions with register /// mask operands in the basic block numbered \p MBBNum. ArrayRef getRegMaskSlotsInBlock(unsigned MBBNum) const { std::pair P = RegMaskBlocks[MBBNum]; return getRegMaskSlots().slice(P.first, P.second); } /// Returns an array of register mask pointers corresponding to /// getRegMaskSlots(). ArrayRef getRegMaskBits() const { return RegMaskBits; } /// Returns an array of mask pointers corresponding to /// getRegMaskSlotsInBlock(MBBNum). ArrayRef getRegMaskBitsInBlock(unsigned MBBNum) const { std::pair P = RegMaskBlocks[MBBNum]; return getRegMaskBits().slice(P.first, P.second); } /// Test if \p LI is live across any register mask instructions, and /// compute a bit mask of physical registers that are not clobbered by any /// of them. /// /// Returns false if \p LI doesn't cross any register mask instructions. In /// that case, the bit vector is not filled in. bool checkRegMaskInterference(LiveInterval &LI, BitVector &UsableRegs); - /// Get the interferenced slot index and its regmask for an live interval. - /// Return false if ther is no interference. - bool - getInterferenceRegMasks(LiveInterval &LI, - SmallVectorImpl &RegSlots, - SmallVectorImpl &RegMaskBits); - // Register unit functions. // // Fixed interference occurs when MachineInstrs use physregs directly // instead of virtual registers. 
This typically happens when passing // arguments to a function call, or when instructions require operands in // fixed registers. // // Each physreg has one or more register units, see MCRegisterInfo. We // track liveness per register unit to handle aliasing registers more // efficiently. /// Return the live range for register unit \p Unit. It will be computed if /// it doesn't exist. LiveRange &getRegUnit(unsigned Unit) { LiveRange *LR = RegUnitRanges[Unit]; if (!LR) { // Compute missing ranges on demand. // Use segment set to speed-up initial computation of the live range. RegUnitRanges[Unit] = LR = new LiveRange(UseSegmentSetForPhysRegs); computeRegUnitRange(*LR, Unit); } return *LR; } /// Return the live range for register unit \p Unit if it has already been /// computed, or nullptr if it hasn't been computed yet. LiveRange *getCachedRegUnit(unsigned Unit) { return RegUnitRanges[Unit]; } const LiveRange *getCachedRegUnit(unsigned Unit) const { return RegUnitRanges[Unit]; } /// Remove computed live range for register unit \p Unit. Subsequent uses /// should rely on on-demand recomputation. void removeRegUnit(unsigned Unit) { delete RegUnitRanges[Unit]; RegUnitRanges[Unit] = nullptr; } /// Remove associated live ranges for the register units associated with \p /// Reg. Subsequent uses should rely on on-demand recomputation. \note This /// method can result in inconsistent liveness tracking if multiple phyical /// registers share a regunit, and should be used cautiously. void removeAllRegUnitsForPhysReg(MCRegister Reg) { for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) removeRegUnit(*Units); } /// Remove value numbers and related live segments starting at position /// \p Pos that are part of any liverange of physical register \p Reg or one /// of its subregisters. void removePhysRegDefAt(MCRegister Reg, SlotIndex Pos); /// Remove value number and related live segments of \p LI and its subranges /// that start at position \p Pos. void removeVRegDefAt(LiveInterval &LI, SlotIndex Pos); /// Split separate components in LiveInterval \p LI into separate intervals. void splitSeparateComponents(LiveInterval &LI, SmallVectorImpl &SplitLIs); /// For live interval \p LI with correct SubRanges construct matching /// information for the main live range. Expects the main live range to not /// have any segments or value numbers. void constructMainRangeFromSubranges(LiveInterval &LI); private: /// Compute live intervals for all virtual registers. void computeVirtRegs(); /// Compute RegMaskSlots and RegMaskBits. void computeRegMasks(); /// Walk the values in \p LI and check for dead values: /// - Dead PHIDef values are marked as unused. /// - Dead operands are marked as such. /// - Completely dead machine instructions are added to the \p dead vector /// if it is not nullptr. /// Returns true if any PHI value numbers have been removed which may /// have separated the interval into multiple connected components. 
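/// For example, computeVirtRegs() feeds this result straight into
/// splitSeparateComponents():
///   if (computeVirtRegInterval(LI))
///     splitSeparateComponents(LI, SplitLIs);
/// so a conservative 'true' only costs an extra connected-component check.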
bool computeDeadValues(LiveInterval &LI, SmallVectorImpl *dead); static LiveInterval *createInterval(Register Reg); void printInstrs(raw_ostream &O) const; void dumpInstrs() const; void computeLiveInRegUnits(); void computeRegUnitRange(LiveRange&, unsigned Unit); bool computeVirtRegInterval(LiveInterval&); using ShrinkToUsesWorkList = SmallVector, 16>; void extendSegmentsToUses(LiveRange &Segments, ShrinkToUsesWorkList &WorkList, Register Reg, LaneBitmask LaneMask); /// Helper function for repairIntervalsInRange(), walks backwards and /// creates/modifies live segments in \p LR to match the operands found. /// Only full operands or operands with subregisters matching \p LaneMask /// are considered. void repairOldRegInRange(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, const SlotIndex endIdx, LiveRange &LR, Register Reg, LaneBitmask LaneMask = LaneBitmask::getAll()); class HMEditor; }; } // end namespace llvm #endif diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 17005b38ac94..a32b486240c8 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -1,1785 +1,1735 @@ //===- LiveIntervals.cpp - Live Interval Analysis -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file This file implements the LiveInterval analysis pass which is used /// by the Linear Scan Register allocator. This pass linearizes the /// basic blocks of the function in DFS order and computes live intervals for /// each virtual and physical register. 
// //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalCalc.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/InstrTypes.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "regalloc" char LiveIntervals::ID = 0; char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_END(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) #ifndef NDEBUG static cl::opt EnablePrecomputePhysRegs( "precompute-phys-liveness", cl::Hidden, cl::desc("Eagerly compute live intervals for all physreg units.")); #else static bool EnablePrecomputePhysRegs = false; #endif // NDEBUG namespace llvm { cl::opt UseSegmentSetForPhysRegs( "use-segment-set-for-physregs", cl::Hidden, cl::init(true), cl::desc( "Use segment set for the computation of the live ranges of physregs.")); } // end namespace llvm void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreservedID(MachineLoopInfoID); AU.addRequiredTransitiveID(MachineDominatorsID); AU.addPreservedID(MachineDominatorsID); AU.addPreserved(); AU.addRequiredTransitive(); MachineFunctionPass::getAnalysisUsage(AU); } LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) { initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); } LiveIntervals::~LiveIntervals() { delete LICalc; } void LiveIntervals::releaseMemory() { // Free the live intervals themselves. for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i) delete VirtRegIntervals[Register::index2VirtReg(i)]; VirtRegIntervals.clear(); RegMaskSlots.clear(); RegMaskBits.clear(); RegMaskBlocks.clear(); for (LiveRange *LR : RegUnitRanges) delete LR; RegUnitRanges.clear(); // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. 
VNInfoAllocator.Reset(); } bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { MF = &fn; MRI = &MF->getRegInfo(); TRI = MF->getSubtarget().getRegisterInfo(); TII = MF->getSubtarget().getInstrInfo(); AA = &getAnalysis().getAAResults(); Indexes = &getAnalysis(); DomTree = &getAnalysis(); if (!LICalc) LICalc = new LiveIntervalCalc(); // Allocate space for all virtual registers. VirtRegIntervals.resize(MRI->getNumVirtRegs()); computeVirtRegs(); computeRegMasks(); computeLiveInRegUnits(); if (EnablePrecomputePhysRegs) { // For stress testing, precompute live ranges of all physical register // units, including reserved registers. for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i) getRegUnit(i); } LLVM_DEBUG(dump()); return true; } void LiveIntervals::print(raw_ostream &OS, const Module* ) const { OS << "********** INTERVALS **********\n"; // Dump the regunits. for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit) if (LiveRange *LR = RegUnitRanges[Unit]) OS << printRegUnit(Unit, TRI) << ' ' << *LR << '\n'; // Dump the virtregs. for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { Register Reg = Register::index2VirtReg(i); if (hasInterval(Reg)) OS << getInterval(Reg) << '\n'; } OS << "RegMasks:"; for (SlotIndex Idx : RegMaskSlots) OS << ' ' << Idx; OS << '\n'; printInstrs(OS); } void LiveIntervals::printInstrs(raw_ostream &OS) const { OS << "********** MACHINEINSTRS **********\n"; MF->print(OS, Indexes); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { printInstrs(dbgs()); } #endif LiveInterval *LiveIntervals::createInterval(Register reg) { float Weight = Register::isPhysicalRegister(reg) ? huge_valf : 0.0F; return new LiveInterval(reg, Weight); } /// Compute the live interval of a virtual register, based on defs and uses. bool LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LICalc && "LICalc not initialized."); assert(LI.empty() && "Should only compute empty intervals."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); LICalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg())); return computeDeadValues(LI, nullptr); } void LiveIntervals::computeVirtRegs() { for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { Register Reg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; LiveInterval &LI = createEmptyInterval(Reg); bool NeedSplit = computeVirtRegInterval(LI); if (NeedSplit) { SmallVector SplitLIs; splitSeparateComponents(LI, SplitLIs); } } } void LiveIntervals::computeRegMasks() { RegMaskBlocks.resize(MF->getNumBlockIDs()); // Find all instructions with regmask operands. for (const MachineBasicBlock &MBB : *MF) { std::pair &RMB = RegMaskBlocks[MBB.getNumber()]; RMB.first = RegMaskSlots.size(); // Some block starts, such as EH funclets, create masks. if (const uint32_t *Mask = MBB.getBeginClobberMask(TRI)) { RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB)); RegMaskBits.push_back(Mask); } // Unwinders may clobber additional registers. // FIXME: This functionality can possibly be merged into // MachineBasicBlock::getBeginClobberMask(). 
if (MBB.isEHPad()) if (auto *Mask = TRI->getCustomEHPadPreservedMask(*MBB.getParent())) { RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB)); RegMaskBits.push_back(Mask); } for (const MachineInstr &MI : MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isRegMask()) continue; RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot()); RegMaskBits.push_back(MO.getRegMask()); } } // Some block ends, such as funclet returns, create masks. Put the mask on // the last instruction of the block, because MBB slot index intervals are // half-open. if (const uint32_t *Mask = MBB.getEndClobberMask(TRI)) { assert(!MBB.empty() && "empty return block?"); RegMaskSlots.push_back( Indexes->getInstructionIndex(MBB.back()).getRegSlot()); RegMaskBits.push_back(Mask); } // Compute the number of register mask instructions in this block. RMB.second = RegMaskSlots.size() - RMB.first; } } //===----------------------------------------------------------------------===// // Register Unit Liveness //===----------------------------------------------------------------------===// // // Fixed interference typically comes from ABI boundaries: Function arguments // and return values are passed in fixed registers, and so are exception // pointers entering landing pads. Certain instructions require values to be // present in specific registers. That is also represented through fixed // interference. // /// Compute the live range of a register unit, based on the uses and defs of /// aliasing registers. The range should be empty, or contain only dead /// phi-defs from ABI blocks. void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { assert(LICalc && "LICalc not initialized."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); // The physregs aliasing Unit are the roots and their super-registers. // Create all values as dead defs before extending to uses. Note that roots // may share super-registers. That's OK because createDeadDefs() is // idempotent. It is very rare for a register unit to have multiple roots, so // uniquing super-registers is probably not worthwhile. bool IsReserved = false; for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { bool IsRootReserved = true; for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); Super.isValid(); ++Super) { MCRegister Reg = *Super; if (!MRI->reg_empty(Reg)) LICalc->createDeadDefs(LR, Reg); // A register unit is considered reserved if all its roots and all their // super registers are reserved. if (!MRI->isReserved(Reg)) IsRootReserved = false; } IsReserved |= IsRootReserved; } assert(IsReserved == MRI->isReservedRegUnit(Unit) && "reserved computation mismatch"); // Now extend LR to reach all uses. // Ignore uses of reserved registers. We only track defs of those. if (!IsReserved) { for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); Super.isValid(); ++Super) { MCRegister Reg = *Super; if (!MRI->reg_empty(Reg)) LICalc->extendToUses(LR, Reg); } } } // Flush the segment set to the segment vector. if (UseSegmentSetForPhysRegs) LR.flushSegmentSet(); } /// Precompute the live ranges of any register units that are live-in to an ABI /// block somewhere. Register values can appear without a corresponding def when /// entering the entry block or a landing pad. 
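/// For illustration (hypothetical target registers, not taken from the
/// original source): an argument passed in $edi is live-in to the entry block
/// with no defining instruction, so its register units get a dead def
/// ("phi-def") at the block start index, which computeRegUnitRange() later
/// extends to reach uses such as
///   %0:gr32 = COPY $edi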
void LiveIntervals::computeLiveInRegUnits() { RegUnitRanges.resize(TRI->getNumRegUnits()); LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); // Keep track of the live range sets allocated. SmallVector NewRanges; // Check all basic blocks for live-ins. for (const MachineBasicBlock &MBB : *MF) { // We only care about ABI blocks: Entry + landing pads. if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty()) continue; // Create phi-defs at Begin for all live-in registers. SlotIndex Begin = Indexes->getMBBStartIdx(&MBB); LLVM_DEBUG(dbgs() << Begin << "\t" << printMBBReference(MBB)); for (const auto &LI : MBB.liveins()) { for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange *LR = RegUnitRanges[Unit]; if (!LR) { // Use segment set to speed-up initial computation of the live range. LR = RegUnitRanges[Unit] = new LiveRange(UseSegmentSetForPhysRegs); NewRanges.push_back(Unit); } VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator()); (void)VNI; LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << '#' << VNI->id); } } LLVM_DEBUG(dbgs() << '\n'); } LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. for (unsigned Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); } static void createSegmentsForValues(LiveRange &LR, iterator_range VNIs) { for (VNInfo *VNI : VNIs) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; LR.addSegment(LiveRange::Segment(Def, Def.getDeadSlot(), VNI)); } } void LiveIntervals::extendSegmentsToUses(LiveRange &Segments, ShrinkToUsesWorkList &WorkList, Register Reg, LaneBitmask LaneMask) { // Keep track of the PHIs that are in use. SmallPtrSet UsedPHIs; // Blocks that have already been added to WorkList as live-out. SmallPtrSet LiveOut; auto getSubRange = [](const LiveInterval &I, LaneBitmask M) -> const LiveRange& { if (M.none()) return I; for (const LiveInterval::SubRange &SR : I.subranges()) { if ((SR.LaneMask & M).any()) { assert(SR.LaneMask == M && "Expecting lane masks to match exactly"); return SR; } } llvm_unreachable("Subrange for mask not found"); }; const LiveInterval &LI = getInterval(Reg); const LiveRange &OldRange = getSubRange(LI, LaneMask); // Extend intervals to reach all uses in WorkList. while (!WorkList.empty()) { SlotIndex Idx = WorkList.back().first; VNInfo *VNI = WorkList.back().second; WorkList.pop_back(); const MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Idx.getPrevSlot()); SlotIndex BlockStart = Indexes->getMBBStartIdx(MBB); // Extend the live range for VNI to be live at Idx. if (VNInfo *ExtVNI = Segments.extendInBlock(BlockStart, Idx)) { assert(ExtVNI == VNI && "Unexpected existing value number"); (void)ExtVNI; // Is this a PHIDef we haven't seen before? if (!VNI->isPHIDef() || VNI->def != BlockStart || !UsedPHIs.insert(VNI).second) continue; // The PHI is live, make sure the predecessors are live-out. for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes->getMBBEndIdx(Pred); // A predecessor is not required to have a live-out value for a PHI. if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop)) WorkList.push_back(std::make_pair(Stop, PVNI)); } continue; } // VNI is live-in to MBB. LLVM_DEBUG(dbgs() << " live-in at " << BlockStart << '\n'); Segments.addSegment(LiveRange::Segment(BlockStart, Idx, VNI)); // Make sure VNI is live-out from the predecessors. 
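    // A predecessor with no live-out value is only expected for subranges
    // whose lanes are undef along that path; the assertions below verify this
    // (joint dominance by the subrange undefs).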
for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes->getMBBEndIdx(Pred); if (VNInfo *OldVNI = OldRange.getVNInfoBefore(Stop)) { assert(OldVNI == VNI && "Wrong value out of predecessor"); (void)OldVNI; WorkList.push_back(std::make_pair(Stop, VNI)); } else { #ifndef NDEBUG // There was no old VNI. Verify that Stop is jointly dominated // by s for this live range. assert(LaneMask.any() && "Missing value out of predecessor for main range"); SmallVector Undefs; LI.computeSubRangeUndefs(Undefs, LaneMask, *MRI, *Indexes); assert(LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes) && "Missing value out of predecessor for subrange"); #endif } } } } bool LiveIntervals::shrinkToUses(LiveInterval *li, SmallVectorImpl *dead) { LLVM_DEBUG(dbgs() << "Shrink: " << *li << '\n'); assert(Register::isVirtualRegister(li->reg()) && "Can only shrink virtual registers"); // Shrink subregister live ranges. bool NeedsCleanup = false; for (LiveInterval::SubRange &S : li->subranges()) { shrinkToUses(S, li->reg()); if (S.empty()) NeedsCleanup = true; } if (NeedsCleanup) li->removeEmptySubRanges(); // Find all the values used, including PHI kills. ShrinkToUsesWorkList WorkList; // Visit all instructions reading li->reg(). Register Reg = li->reg(); for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) { if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg)) continue; SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot(); LiveQueryResult LRQ = li->Query(Idx); VNInfo *VNI = LRQ.valueIn(); if (!VNI) { // This shouldn't happen: readsVirtualRegister returns true, but there is // no live value. It is likely caused by a target getting flags // wrong. LLVM_DEBUG( dbgs() << Idx << '\t' << UseMI << "Warning: Instr claims to read non-existent value in " << *li << '\n'); continue; } // Special case: An early-clobber tied operand reads and writes the // register one slot early. if (VNInfo *DefVNI = LRQ.valueDefined()) Idx = DefVNI->def; WorkList.push_back(std::make_pair(Idx, VNI)); } // Create new live ranges with only minimal live segments per def. LiveRange NewLR; createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end())); extendSegmentsToUses(NewLR, WorkList, Reg, LaneBitmask::getNone()); // Move the trimmed segments back. li->segments.swap(NewLR.segments); // Handle dead values. bool CanSeparate = computeDeadValues(*li, dead); LLVM_DEBUG(dbgs() << "Shrunk: " << *li << '\n'); return CanSeparate; } bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl *dead) { bool MayHaveSplitComponents = false; bool HaveDeadDef = false; for (VNInfo *VNI : LI.valnos) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; LiveRange::iterator I = LI.FindSegmentContaining(Def); assert(I != LI.end() && "Missing segment for VNI"); // Is the register live before? Otherwise we may have to add a read-undef // flag for subregister defs. Register VReg = LI.reg(); if (MRI->shouldTrackSubRegLiveness(VReg)) { if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) { MachineInstr *MI = getInstructionFromIndex(Def); MI->setRegisterDefReadUndef(VReg); } } if (I->end != Def.getDeadSlot()) continue; if (VNI->isPHIDef()) { // This is a dead PHI. Remove it. VNI->markUnused(); LI.removeSegment(I); LLVM_DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n"); MayHaveSplitComponents = true; } else { // This is a dead def. Make sure the instruction knows. 
MachineInstr *MI = getInstructionFromIndex(Def); assert(MI && "No instruction defining live value"); MI->addRegisterDead(LI.reg(), TRI); if (HaveDeadDef) MayHaveSplitComponents = true; HaveDeadDef = true; if (dead && MI->allDefsAreDead()) { LLVM_DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI); dead->push_back(MI); } } } return MayHaveSplitComponents; } void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, Register Reg) { LLVM_DEBUG(dbgs() << "Shrink: " << SR << '\n'); assert(Register::isVirtualRegister(Reg) && "Can only shrink virtual registers"); // Find all the values used, including PHI kills. ShrinkToUsesWorkList WorkList; // Visit all instructions reading Reg. SlotIndex LastIdx; for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { // Skip "undef" uses. if (!MO.readsReg()) continue; // Maybe the operand is for a subregister we don't care about. unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg); if ((LaneMask & SR.LaneMask).none()) continue; } // We only need to visit each instruction once. MachineInstr *UseMI = MO.getParent(); SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot(); if (Idx == LastIdx) continue; LastIdx = Idx; LiveQueryResult LRQ = SR.Query(Idx); VNInfo *VNI = LRQ.valueIn(); // For Subranges it is possible that only undef values are left in that // part of the subregister, so there is no real liverange at the use if (!VNI) continue; // Special case: An early-clobber tied operand reads and writes the // register one slot early. if (VNInfo *DefVNI = LRQ.valueDefined()) Idx = DefVNI->def; WorkList.push_back(std::make_pair(Idx, VNI)); } // Create a new live ranges with only minimal live segments per def. LiveRange NewLR; createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end())); extendSegmentsToUses(NewLR, WorkList, Reg, SR.LaneMask); // Move the trimmed ranges back. SR.segments.swap(NewLR.segments); // Remove dead PHI value numbers for (VNInfo *VNI : SR.valnos) { if (VNI->isUnused()) continue; const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def); assert(Segment != nullptr && "Missing segment for VNI"); if (Segment->end != VNI->def.getDeadSlot()) continue; if (VNI->isPHIDef()) { // This is a dead PHI. Remove it. LLVM_DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n"); VNI->markUnused(); SR.removeSegment(*Segment); } } LLVM_DEBUG(dbgs() << "Shrunk: " << SR << '\n'); } void LiveIntervals::extendToIndices(LiveRange &LR, ArrayRef Indices, ArrayRef Undefs) { assert(LICalc && "LICalc not initialized."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); for (SlotIndex Idx : Indices) LICalc->extend(LR, Idx, /*PhysReg=*/0, Undefs); } void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, SmallVectorImpl *EndPoints) { LiveQueryResult LRQ = LR.Query(Kill); VNInfo *VNI = LRQ.valueOutOrDead(); if (!VNI) return; MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill); SlotIndex MBBEnd = Indexes->getMBBEndIdx(KillMBB); // If VNI isn't live out from KillMBB, the value is trivially pruned. if (LRQ.endPoint() < MBBEnd) { LR.removeSegment(Kill, LRQ.endPoint()); if (EndPoints) EndPoints->push_back(LRQ.endPoint()); return; } // VNI is live out of KillMBB. LR.removeSegment(Kill, MBBEnd); if (EndPoints) EndPoints->push_back(MBBEnd); // Find all blocks that are reachable from KillMBB without leaving VNI's live // range. It is possible that KillMBB itself is reachable, so start a DFS // from each successor. 
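  // All successors share a single Visited set, so each block is processed at
  // most once even if it is reachable from several successors, and
  // skipChildren() stops the walk as soon as VNI is no longer live-in.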
using VisitedTy = df_iterator_default_set; VisitedTy Visited; for (MachineBasicBlock *Succ : KillMBB->successors()) { for (df_ext_iterator I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited); I != E;) { MachineBasicBlock *MBB = *I; // Check if VNI is live in to MBB. SlotIndex MBBStart, MBBEnd; std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB); LiveQueryResult LRQ = LR.Query(MBBStart); if (LRQ.valueIn() != VNI) { // This block isn't part of the VNI segment. Prune the search. I.skipChildren(); continue; } // Prune the search if VNI is killed in MBB. if (LRQ.endPoint() < MBBEnd) { LR.removeSegment(MBBStart, LRQ.endPoint()); if (EndPoints) EndPoints->push_back(LRQ.endPoint()); I.skipChildren(); continue; } // VNI is live through MBB. LR.removeSegment(MBBStart, MBBEnd); if (EndPoints) EndPoints->push_back(MBBEnd); ++I; } } } //===----------------------------------------------------------------------===// // Register allocator hooks. // void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // Keep track of regunit ranges. SmallVector, 8> RU; // Keep track of subregister ranges. SmallVector, 4> SRs; for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { Register Reg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; const LiveInterval &LI = getInterval(Reg); if (LI.empty()) continue; // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. RU.clear(); for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); ++Unit) { const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) continue; RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); } if (MRI->subRegLivenessEnabled()) { SRs.clear(); for (const LiveInterval::SubRange &SR : LI.subranges()) { SRs.push_back(std::make_pair(&SR, SR.find(LI.begin()->end))); } } // Every instruction that kills Reg corresponds to a segment range end // point. for (LiveInterval::const_iterator RI = LI.begin(), RE = LI.end(); RI != RE; ++RI) { // A block index indicates an MBB edge. if (RI->end.isBlock()) continue; MachineInstr *MI = getInstructionFromIndex(RI->end); if (!MI) continue; // Check if any of the regunits are live beyond the end of RI. That could // happen when a physreg is defined as a copy of a virtreg: // // %eax = COPY %5 // FOO %5 <--- MI, cancel kill because %eax is live. // BAR killed %eax // // There should be no kill flag on FOO when %5 is rewritten as %eax. for (auto &RUP : RU) { const LiveRange &RURange = *RUP.first; LiveRange::const_iterator &I = RUP.second; if (I == RURange.end()) continue; I = RURange.advanceTo(I, RI->end); if (I == RURange.end() || I->start >= RI->end) continue; // I is overlapping RI. goto CancelKill; } if (MRI->subRegLivenessEnabled()) { // When reading a partial undefined value we must not add a kill flag. // The regalloc might have used the undef lane for something else. // Example: // %1 = ... ; R32: %1 // %2:high16 = ... ; R64: %2 // = read killed %2 ; R64: %2 // = read %1 ; R32: %1 // The flag is correct for %2, but the register allocator may // assign R0L to %1, and R0 to %2 because the low 32bits of R0 // are actually never written by %2. After assignment the // flag at the read instruction is invalid. LaneBitmask DefinedLanesMask; if (!SRs.empty()) { // Compute a mask of lanes that are defined. 
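          // Advance each subrange's cached iterator (SRP.second) to the
          // segment overlapping RI->end, and OR in its lane mask when that
          // subrange is live there.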
DefinedLanesMask = LaneBitmask::getNone(); for (auto &SRP : SRs) { const LiveInterval::SubRange &SR = *SRP.first; LiveRange::const_iterator &I = SRP.second; if (I == SR.end()) continue; I = SR.advanceTo(I, RI->end); if (I == SR.end() || I->start >= RI->end) continue; // I is overlapping RI DefinedLanesMask |= SR.LaneMask; } } else DefinedLanesMask = LaneBitmask::getAll(); bool IsFullWrite = false; for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.getReg() != Reg) continue; if (MO.isUse()) { // Reading any undefined lanes? LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); if ((UseMask & ~DefinedLanesMask).any()) goto CancelKill; } else if (MO.getSubReg() == 0) { // Writing to the full register? assert(MO.isDef()); IsFullWrite = true; } } // If an instruction writes to a subregister, a new segment starts in // the LiveInterval. But as this is only overriding part of the register // adding kill-flags is not correct here after registers have been // assigned. if (!IsFullWrite) { // Next segment has to be adjacent in the subregister write case. LiveRange::const_iterator N = std::next(RI); if (N != LI.end() && N->start == RI->end) goto CancelKill; } } MI->addRegisterKilled(Reg, nullptr); continue; CancelKill: MI->clearRegisterKills(Reg, nullptr); } } } MachineBasicBlock* LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { // A local live range must be fully contained inside the block, meaning it is // defined and killed at instructions, not at block boundaries. It is not // live in or out of any block. // // It is technically possible to have a PHI-defined live range identical to a // single block, but we are going to return false in that case. SlotIndex Start = LI.beginIndex(); if (Start.isBlock()) return nullptr; SlotIndex Stop = LI.endIndex(); if (Stop.isBlock()) return nullptr; // getMBBFromIndex doesn't need to search the MBB table when both indexes // belong to proper instructions. MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start); MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop); return MBB1 == MBB2 ? MBB1 : nullptr; } bool LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const { for (const VNInfo *PHI : LI.valnos) { if (PHI->isUnused() || !PHI->isPHIDef()) continue; const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def); // Conservatively return true instead of scanning huge predecessor lists. 
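    // (This matches the header's contract that hasPHIKill() may conservatively
    // return true to avoid expensive computations.)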
if (PHIMBB->pred_size() > 100) return true; for (const MachineBasicBlock *Pred : PHIMBB->predecessors()) if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred))) return true; } return false; } float LiveIntervals::getSpillWeight(bool isDef, bool isUse, const MachineBlockFrequencyInfo *MBFI, const MachineInstr &MI) { return getSpillWeight(isDef, isUse, MBFI, MI.getParent()); } float LiveIntervals::getSpillWeight(bool isDef, bool isUse, const MachineBlockFrequencyInfo *MBFI, const MachineBasicBlock *MBB) { return (isDef + isUse) * MBFI->getBlockFreqRelativeToEntryBlock(MBB); } LiveRange::Segment LiveIntervals::addSegmentToEndOfBlock(Register Reg, MachineInstr &startInst) { LiveInterval &Interval = createEmptyInterval(Reg); VNInfo *VN = Interval.getNextValue( SlotIndex(getInstructionIndex(startInst).getRegSlot()), getVNInfoAllocator()); LiveRange::Segment S(SlotIndex(getInstructionIndex(startInst).getRegSlot()), getMBBEndIdx(startInst.getParent()), VN); Interval.addSegment(S); return S; } //===----------------------------------------------------------------------===// // Register mask functions //===----------------------------------------------------------------------===// bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, BitVector &UsableRegs) { if (LI.empty()) return false; LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end(); // Use a smaller arrays for local live ranges. ArrayRef Slots; ArrayRef Bits; if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) { Slots = getRegMaskSlotsInBlock(MBB->getNumber()); Bits = getRegMaskBitsInBlock(MBB->getNumber()); } else { Slots = getRegMaskSlots(); Bits = getRegMaskBits(); } // We are going to enumerate all the register mask slots contained in LI. // Start with a binary search of RegMaskSlots to find a starting point. ArrayRef::iterator SlotI = llvm::lower_bound(Slots, LiveI->start); ArrayRef::iterator SlotE = Slots.end(); // No slots in range, LI begins after the last call. if (SlotI == SlotE) return false; bool Found = false; while (true) { assert(*SlotI >= LiveI->start); // Loop over all slots overlapping this segment. while (*SlotI < LiveI->end) { // *SlotI overlaps LI. Collect mask bits. if (!Found) { // This is the first overlap. Initialize UsableRegs to all ones. UsableRegs.clear(); UsableRegs.resize(TRI->getNumRegs(), true); Found = true; } // Remove usable registers clobbered by this mask. UsableRegs.clearBitsNotInMask(Bits[SlotI-Slots.begin()]); if (++SlotI == SlotE) return Found; } // *SlotI is beyond the current LI segment. LiveI = LI.advanceTo(LiveI, *SlotI); if (LiveI == LiveE) return Found; // Advance SlotI until it overlaps. while (*SlotI < LiveI->start) if (++SlotI == SlotE) return Found; } } -bool LiveIntervals::getInterferenceRegMasks( - LiveInterval &LI, SmallVectorImpl &RegSlots, - SmallVectorImpl &RegBits) { - if (LI.empty()) - return false; - LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end(); - - // Use a smaller arrays for local live ranges. - ArrayRef Slots; - ArrayRef Bits; - if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) { - Slots = getRegMaskSlotsInBlock(MBB->getNumber()); - Bits = getRegMaskBitsInBlock(MBB->getNumber()); - } else { - Slots = getRegMaskSlots(); - Bits = getRegMaskBits(); - } - - // We are going to enumerate all the register mask slots contained in LI. - // Start with a binary search of RegMaskSlots to find a starting point. 
- ArrayRef::iterator SlotI = llvm::lower_bound(Slots, LiveI->start); - ArrayRef::iterator SlotE = Slots.end(); - - // No slots in range, LI begins after the last call. - if (SlotI == SlotE) - return false; - - bool Found = false; - while (true) { - assert(*SlotI >= LiveI->start); - // Loop over all slots overlapping this segment. - while (*SlotI < LiveI->end) { - // *SlotI overlaps LI. Collect mask bits. - Found = true; - RegSlots.push_back(*SlotI); - RegBits.push_back(Bits[SlotI - Slots.begin()]); - if (++SlotI == SlotE) - return Found; - } - // *SlotI is beyond the current LI segment. - LiveI = LI.advanceTo(LiveI, *SlotI); - if (LiveI == LiveE) - return Found; - // Advance SlotI until it overlaps. - while (*SlotI < LiveI->start) - if (++SlotI == SlotE) - return Found; - } -} - //===----------------------------------------------------------------------===// // IntervalUpdate class. //===----------------------------------------------------------------------===// /// Toolkit used by handleMove to trim or extend live intervals. class LiveIntervals::HMEditor { private: LiveIntervals& LIS; const MachineRegisterInfo& MRI; const TargetRegisterInfo& TRI; SlotIndex OldIdx; SlotIndex NewIdx; SmallPtrSet Updated; bool UpdateFlags; public: HMEditor(LiveIntervals& LIS, const MachineRegisterInfo& MRI, const TargetRegisterInfo& TRI, SlotIndex OldIdx, SlotIndex NewIdx, bool UpdateFlags) : LIS(LIS), MRI(MRI), TRI(TRI), OldIdx(OldIdx), NewIdx(NewIdx), UpdateFlags(UpdateFlags) {} // FIXME: UpdateFlags is a workaround that creates live intervals for all // physregs, even those that aren't needed for regalloc, in order to update // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill // flags, and postRA passes will use a live register utility instead. LiveRange *getRegUnitLI(unsigned Unit) { if (UpdateFlags && !MRI.isReservedRegUnit(Unit)) return &LIS.getRegUnit(Unit); return LIS.getCachedRegUnit(Unit); } /// Update all live ranges touched by MI, assuming a move from OldIdx to /// NewIdx. void updateAllRanges(MachineInstr *MI) { LLVM_DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": " << *MI); bool hasRegMask = false; for (MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) hasRegMask = true; if (!MO.isReg()) continue; if (MO.isUse()) { if (!MO.readsReg()) continue; // Aggressively clear all kill flags. // They are reinserted by VirtRegRewriter. MO.setIsKill(false); } Register Reg = MO.getReg(); if (!Reg) continue; if (Register::isVirtualRegister(Reg)) { LiveInterval &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { unsigned SubReg = MO.getSubReg(); LaneBitmask LaneMask = SubReg ? TRI.getSubRegIndexLaneMask(SubReg) : MRI.getMaxLaneMaskForVReg(Reg); for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask).none()) continue; updateRange(S, Reg, S.LaneMask); } } updateRange(LI, Reg, LaneBitmask::getNone()); // If main range has a hole and we are moving a subrange use across // the hole updateRange() cannot properly handle it since it only // gets the LiveRange and not the whole LiveInterval. As a result // we may end up with a main range not covering all subranges. // This is extremely rare case, so let's check and reconstruct the // main range. for (LiveInterval::SubRange &S : LI.subranges()) { if (LI.covers(S)) continue; LI.clear(); LIS.constructMainRangeFromSubranges(LI); break; } continue; } // For physregs, only update the regunits that actually have a // precomputed live range. 
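      // getRegUnitLI() returns the cached range (possibly null); when
      // UpdateFlags is set it computes ranges on demand for non-reserved
      // units so that kill flags can still be updated.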
for (MCRegUnitIterator Units(Reg.asMCReg(), &TRI); Units.isValid(); ++Units) if (LiveRange *LR = getRegUnitLI(*Units)) updateRange(*LR, *Units, LaneBitmask::getNone()); } if (hasRegMask) updateRegMaskSlots(); } private: /// Update a single live range, assuming an instruction has been moved from /// OldIdx to NewIdx. void updateRange(LiveRange &LR, Register Reg, LaneBitmask LaneMask) { if (!Updated.insert(&LR).second) return; LLVM_DEBUG({ dbgs() << " "; if (Register::isVirtualRegister(Reg)) { dbgs() << printReg(Reg); if (LaneMask.any()) dbgs() << " L" << PrintLaneMask(LaneMask); } else { dbgs() << printRegUnit(Reg, &TRI); } dbgs() << ":\t" << LR << '\n'; }); if (SlotIndex::isEarlierInstr(OldIdx, NewIdx)) handleMoveDown(LR); else handleMoveUp(LR, Reg, LaneMask); LLVM_DEBUG(dbgs() << " -->\t" << LR << '\n'); LR.verify(); } /// Update LR to reflect an instruction has been moved downwards from OldIdx /// to NewIdx (OldIdx < NewIdx). void handleMoveDown(LiveRange &LR) { LiveRange::iterator E = LR.end(); // Segment going into OldIdx. LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex()); // No value live before or after OldIdx? Nothing to do. if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start)) return; LiveRange::iterator OldIdxOut; // Do we have a value live-in to OldIdx? if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) { // If the live-in value already extends to NewIdx, there is nothing to do. if (SlotIndex::isEarlierEqualInstr(NewIdx, OldIdxIn->end)) return; // Aggressively remove all kill flags from the old kill point. // Kill flags shouldn't be used while live intervals exist, they will be // reinserted by VirtRegRewriter. if (MachineInstr *KillMI = LIS.getInstructionFromIndex(OldIdxIn->end)) for (MachineOperand &MOP : mi_bundle_ops(*KillMI)) if (MOP.isReg() && MOP.isUse()) MOP.setIsKill(false); // Is there a def before NewIdx which is not OldIdx? LiveRange::iterator Next = std::next(OldIdxIn); if (Next != E && !SlotIndex::isSameInstr(OldIdx, Next->start) && SlotIndex::isEarlierInstr(Next->start, NewIdx)) { // If we are here then OldIdx was just a use but not a def. We only have // to ensure liveness extends to NewIdx. LiveRange::iterator NewIdxIn = LR.advanceTo(Next, NewIdx.getBaseIndex()); // Extend the segment before NewIdx if necessary. if (NewIdxIn == E || !SlotIndex::isEarlierInstr(NewIdxIn->start, NewIdx)) { LiveRange::iterator Prev = std::prev(NewIdxIn); Prev->end = NewIdx.getRegSlot(); } // Extend OldIdxIn. OldIdxIn->end = Next->start; return; } // Adjust OldIdxIn->end to reach NewIdx. This may temporarily make LR // invalid by overlapping ranges. bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end); OldIdxIn->end = NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber()); // If this was not a kill, then there was no def and we're done. if (!isKill) return; // Did we have a Def at OldIdx? OldIdxOut = Next; if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start)) return; } else { OldIdxOut = OldIdxIn; } // If we are here then there is a Definition at OldIdx. OldIdxOut points // to the segment starting there. assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) && "No def?"); VNInfo *OldIdxVNI = OldIdxOut->valno; assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def"); // If the defined value extends beyond NewIdx, just move the beginning // of the segment to NewIdx. 
SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber()); if (SlotIndex::isEarlierInstr(NewIdxDef, OldIdxOut->end)) { OldIdxVNI->def = NewIdxDef; OldIdxOut->start = OldIdxVNI->def; return; } // If we are here then we have a Definition at OldIdx which ends before // NewIdx. // Is there an existing Def at NewIdx? LiveRange::iterator AfterNewIdx = LR.advanceTo(OldIdxOut, NewIdx.getRegSlot()); bool OldIdxDefIsDead = OldIdxOut->end.isDead(); if (!OldIdxDefIsDead && SlotIndex::isEarlierInstr(OldIdxOut->end, NewIdxDef)) { // OldIdx is not a dead def, and NewIdxDef is inside a new interval. VNInfo *DefVNI; if (OldIdxOut != LR.begin() && !SlotIndex::isEarlierInstr(std::prev(OldIdxOut)->end, OldIdxOut->start)) { // There is no gap between OldIdxOut and its predecessor anymore, // merge them. LiveRange::iterator IPrev = std::prev(OldIdxOut); DefVNI = OldIdxVNI; IPrev->end = OldIdxOut->end; } else { // The value is live in to OldIdx LiveRange::iterator INext = std::next(OldIdxOut); assert(INext != E && "Must have following segment"); // We merge OldIdxOut and its successor. As we're dealing with subreg // reordering, there is always a successor to OldIdxOut in the same BB // We don't need INext->valno anymore and will reuse for the new segment // we create later. DefVNI = OldIdxVNI; INext->start = OldIdxOut->end; INext->valno->def = INext->start; } // If NewIdx is behind the last segment, extend that and append a new one. if (AfterNewIdx == E) { // OldIdxOut is undef at this point, Slide (OldIdxOut;AfterNewIdx] up // one position. // |- ?/OldIdxOut -| |- X0 -| ... |- Xn -| end // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS -| end std::copy(std::next(OldIdxOut), E, OldIdxOut); // The last segment is undefined now, reuse it for a dead def. LiveRange::iterator NewSegment = std::prev(E); *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(), DefVNI); DefVNI->def = NewIdxDef; LiveRange::iterator Prev = std::prev(NewSegment); Prev->end = NewIdxDef; } else { // OldIdxOut is undef at this point, Slide (OldIdxOut;AfterNewIdx] up // one position. // |- ?/OldIdxOut -| |- X0 -| ... |- Xn/AfterNewIdx -| |- Next -| // => |- X0/OldIdxOut -| ... |- Xn -| |- Xn/AfterNewIdx -| |- Next -| std::copy(std::next(OldIdxOut), std::next(AfterNewIdx), OldIdxOut); LiveRange::iterator Prev = std::prev(AfterNewIdx); // We have two cases: if (SlotIndex::isEarlierInstr(Prev->start, NewIdxDef)) { // Case 1: NewIdx is inside a liverange. Split this liverange at // NewIdxDef into the segment "Prev" followed by "NewSegment". LiveRange::iterator NewSegment = AfterNewIdx; *NewSegment = LiveRange::Segment(NewIdxDef, Prev->end, Prev->valno); Prev->valno->def = NewIdxDef; *Prev = LiveRange::Segment(Prev->start, NewIdxDef, DefVNI); DefVNI->def = Prev->start; } else { // Case 2: NewIdx is in a lifetime hole. Keep AfterNewIdx as is and // turn Prev into a segment from NewIdx to AfterNewIdx->start. *Prev = LiveRange::Segment(NewIdxDef, AfterNewIdx->start, DefVNI); DefVNI->def = NewIdxDef; assert(DefVNI != AfterNewIdx->valno); } } return; } if (AfterNewIdx != E && SlotIndex::isSameInstr(AfterNewIdx->start, NewIdxDef)) { // There is an existing def at NewIdx. The def at OldIdx is coalesced into // that value. assert(AfterNewIdx->valno != OldIdxVNI && "Multiple defs of value?"); LR.removeValNo(OldIdxVNI); } else { // There was no existing def at NewIdx. We need to create a dead def // at NewIdx. 
Shift segments over the old OldIdxOut segment, this frees // a new segment at the place where we want to construct the dead def. // |- OldIdxOut -| |- X0 -| ... |- Xn -| |- AfterNewIdx -| // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS. -| |- AfterNewIdx -| assert(AfterNewIdx != OldIdxOut && "Inconsistent iterators"); std::copy(std::next(OldIdxOut), AfterNewIdx, OldIdxOut); // We can reuse OldIdxVNI now. LiveRange::iterator NewSegment = std::prev(AfterNewIdx); VNInfo *NewSegmentVNI = OldIdxVNI; NewSegmentVNI->def = NewIdxDef; *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(), NewSegmentVNI); } } /// Update LR to reflect an instruction has been moved upwards from OldIdx /// to NewIdx (NewIdx < OldIdx). void handleMoveUp(LiveRange &LR, Register Reg, LaneBitmask LaneMask) { LiveRange::iterator E = LR.end(); // Segment going into OldIdx. LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex()); // No value live before or after OldIdx? Nothing to do. if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start)) return; LiveRange::iterator OldIdxOut; // Do we have a value live-in to OldIdx? if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) { // If the live-in value isn't killed here, then we have no Def at // OldIdx, moreover the value must be live at NewIdx so there is nothing // to do. bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end); if (!isKill) return; // At this point we have to move OldIdxIn->end back to the nearest // previous use or (dead-)def but no further than NewIdx. SlotIndex DefBeforeOldIdx = std::max(OldIdxIn->start.getDeadSlot(), NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber())); OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask); // Did we have a Def at OldIdx? If not we are done now. OldIdxOut = std::next(OldIdxIn); if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start)) return; } else { OldIdxOut = OldIdxIn; OldIdxIn = OldIdxOut != LR.begin() ? std::prev(OldIdxOut) : E; } // If we are here then there is a Definition at OldIdx. OldIdxOut points // to the segment starting there. assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) && "No def?"); VNInfo *OldIdxVNI = OldIdxOut->valno; assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def"); bool OldIdxDefIsDead = OldIdxOut->end.isDead(); // Is there an existing def at NewIdx? SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber()); LiveRange::iterator NewIdxOut = LR.find(NewIdx.getRegSlot()); if (SlotIndex::isSameInstr(NewIdxOut->start, NewIdx)) { assert(NewIdxOut->valno != OldIdxVNI && "Same value defined more than once?"); // If OldIdx was a dead def remove it. if (!OldIdxDefIsDead) { // Remove segment starting at NewIdx and move begin of OldIdxOut to // NewIdx so it can take its place. OldIdxVNI->def = NewIdxDef; OldIdxOut->start = NewIdxDef; LR.removeValNo(NewIdxOut->valno); } else { // Simply remove the dead def at OldIdx. LR.removeValNo(OldIdxVNI); } } else { // Previously nothing was live after NewIdx, so all we have to do now is // move the begin of OldIdxOut to NewIdx. if (!OldIdxDefIsDead) { // Do we have any intermediate Defs between OldIdx and NewIdx? if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdxDef, OldIdxIn->start)) { // OldIdx is not a dead def and NewIdx is before predecessor start. 
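handleMoveUp has to pull the end of the live-in segment back to the last use at or before the new position, which findLastUseBefore does by walking the register's use operands. A minimal stand-in for that search, assuming the use slots have already been collected into a sorted vector of plain ints; the real code works on MachineOperands and SlotIndexes, and the helper name lastUseBefore is invented here.

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper: latest use slot that is not later than Bound, or
// Fallback when no such use exists.
int lastUseBefore(const std::vector<int> &SortedUseSlots, int Bound, int Fallback) {
  // First element strictly greater than Bound; everything before it is <= Bound.
  auto It = std::upper_bound(SortedUseSlots.begin(), SortedUseSlots.end(), Bound);
  return It == SortedUseSlots.begin() ? Fallback : *std::prev(It);
}

int main() {
  std::vector<int> Uses = {4, 12, 20, 36};
  assert(lastUseBefore(Uses, 30, /*Fallback=*/0) == 20); // last use at or before 30
  assert(lastUseBefore(Uses, 2, /*Fallback=*/0) == 0);   // no use that early: fall back
  return 0;
}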
LiveRange::iterator NewIdxIn = NewIdxOut; assert(NewIdxIn == LR.find(NewIdx.getBaseIndex())); const SlotIndex SplitPos = NewIdxDef; OldIdxVNI = OldIdxIn->valno; SlotIndex NewDefEndPoint = std::next(NewIdxIn)->end; LiveRange::iterator Prev = std::prev(OldIdxIn); if (OldIdxIn != LR.begin() && SlotIndex::isEarlierInstr(NewIdx, Prev->end)) { // If the segment before OldIdx read a value defined earlier than // NewIdx, the moved instruction also reads and forwards that // value. Extend the lifetime of the new def point. // Extend to where the previous range started, unless there is // another redef first. NewDefEndPoint = std::min(OldIdxIn->start, std::next(NewIdxOut)->start); } // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut. OldIdxOut->valno->def = OldIdxIn->start; *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end, OldIdxOut->valno); // OldIdxIn and OldIdxVNI are now undef and can be overridden. // We Slide [NewIdxIn, OldIdxIn) down one position. // |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -| // => |- undef/NexIdxIn -| |- X0 -| ... |- Xn-1 -| |- Xn/OldIdxOut -| std::copy_backward(NewIdxIn, OldIdxIn, OldIdxOut); // NewIdxIn is now considered undef so we can reuse it for the moved // value. LiveRange::iterator NewSegment = NewIdxIn; LiveRange::iterator Next = std::next(NewSegment); if (SlotIndex::isEarlierInstr(Next->start, NewIdx)) { // There is no gap between NewSegment and its predecessor. *NewSegment = LiveRange::Segment(Next->start, SplitPos, Next->valno); *Next = LiveRange::Segment(SplitPos, NewDefEndPoint, OldIdxVNI); Next->valno->def = SplitPos; } else { // There is a gap between NewSegment and its predecessor // Value becomes live in. *NewSegment = LiveRange::Segment(SplitPos, Next->start, OldIdxVNI); NewSegment->valno->def = SplitPos; } } else { // Leave the end point of a live def. OldIdxOut->start = NewIdxDef; OldIdxVNI->def = NewIdxDef; if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end)) OldIdxIn->end = NewIdxDef; } } else if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdxOut->start, NewIdx) && SlotIndex::isEarlierInstr(NewIdx, NewIdxOut->end)) { // OldIdxVNI is a dead def that has been moved into the middle of // another value in LR. That can happen when LR is a whole register, // but the dead def is a write to a subreg that is dead at NewIdx. // The dead def may have been moved across other values // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut) // down one position. // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - | // => |- X0/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -| std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut)); // Modify the segment at NewIdxOut and the following segment to meet at // the point of the dead def, with the following segment getting // OldIdxVNI as its value number. *NewIdxOut = LiveRange::Segment( NewIdxOut->start, NewIdxDef.getRegSlot(), NewIdxOut->valno); *(NewIdxOut + 1) = LiveRange::Segment( NewIdxDef.getRegSlot(), (NewIdxOut + 1)->end, OldIdxVNI); OldIdxVNI->def = NewIdxDef; // Modify subsequent segments to be defined by the moved def OldIdxVNI. for (auto Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx) Idx->valno = OldIdxVNI; // Aggressively remove all dead flags from the former dead definition. // Kill/dead flags shouldn't be used while live intervals exist; they // will be reinserted by VirtRegRewriter. 
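The "slide ... down one position" steps above rely on std::copy_backward to shift a run of segments by one slot so that one entry becomes free for reuse. A tiny self-contained illustration with plain ints in place of LiveRange::Segment:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Seg = {10, 20, 30, 40};
  // Copy the half-open range [begin, begin + 3) into the range that ends at
  // begin + 4, i.e. shift the first three elements one slot to the right.
  std::copy_backward(Seg.begin(), Seg.begin() + 3, Seg.begin() + 4);
  // Seg is now {10, 10, 20, 30}: slot 0 holds a stale copy and can be
  // overwritten, which is how the code above reuses the freed iterator for a
  // newly built segment.
  assert((Seg == std::vector<int>{10, 10, 20, 30}));
  Seg[0] = 5; // reuse the freed slot for the moved definition
  assert((Seg == std::vector<int>{5, 10, 20, 30}));
  return 0;
}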
if (MachineInstr *KillMI = LIS.getInstructionFromIndex(NewIdx)) for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO) if (MO->isReg() && !MO->isUse()) MO->setIsDead(false); } else { // OldIdxVNI is a dead def. It may have been moved across other values // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut) // down one position. // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - | // => |- undef/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -| std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut)); // OldIdxVNI can be reused now to build a new dead def segment. LiveRange::iterator NewSegment = NewIdxOut; VNInfo *NewSegmentVNI = OldIdxVNI; *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(), NewSegmentVNI); NewSegmentVNI->def = NewIdxDef; } } } void updateRegMaskSlots() { SmallVectorImpl::iterator RI = llvm::lower_bound(LIS.RegMaskSlots, OldIdx); assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() && "No RegMask at OldIdx."); *RI = NewIdx.getRegSlot(); assert((RI == LIS.RegMaskSlots.begin() || SlotIndex::isEarlierInstr(*std::prev(RI), *RI)) && "Cannot move regmask instruction above another call"); assert((std::next(RI) == LIS.RegMaskSlots.end() || SlotIndex::isEarlierInstr(*RI, *std::next(RI))) && "Cannot move regmask instruction below another call"); } // Return the last use of reg between NewIdx and OldIdx. SlotIndex findLastUseBefore(SlotIndex Before, Register Reg, LaneBitmask LaneMask) { if (Register::isVirtualRegister(Reg)) { SlotIndex LastUse = Before; for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { if (MO.isUndef()) continue; unsigned SubReg = MO.getSubReg(); if (SubReg != 0 && LaneMask.any() && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask).none()) continue; const MachineInstr &MI = *MO.getParent(); SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI); if (InstSlot > LastUse && InstSlot < OldIdx) LastUse = InstSlot.getRegSlot(); } return LastUse; } // This is a regunit interval, so scanning the use list could be very // expensive. Scan upwards from OldIdx instead. assert(Before < OldIdx && "Expected upwards move"); SlotIndexes *Indexes = LIS.getSlotIndexes(); MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before); // OldIdx may not correspond to an instruction any longer, so set MII to // point to the next instruction after OldIdx, or MBB->end(). MachineBasicBlock::iterator MII = MBB->end(); if (MachineInstr *MI = Indexes->getInstructionFromIndex( Indexes->getNextNonNullIndex(OldIdx))) if (MI->getParent() == MBB) MII = MI; MachineBasicBlock::iterator Begin = MBB->begin(); while (MII != Begin) { if ((--MII)->isDebugInstr()) continue; SlotIndex Idx = Indexes->getInstructionIndex(*MII); // Stop searching when Before is reached. if (!SlotIndex::isEarlierInstr(Before, Idx)) return Before; // Check if MII uses Reg. for (MIBundleOperands MO(*MII); MO.isValid(); ++MO) if (MO->isReg() && !MO->isUndef() && Register::isPhysicalRegister(MO->getReg()) && TRI.hasRegUnit(MO->getReg(), Reg)) return Idx.getRegSlot(); } // Didn't reach Before. It must be the first instruction in the block. return Before; } }; void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) { // It is fine to move a bundle as a whole, but not an individual instruction // inside it. 
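updateRegMaskSlots above leans on RegMaskSlots being kept sorted: the old slot is located with lower_bound, rewritten in place, and the surrounding asserts check that the move did not cross another call's regmask. A small model with plain unsigned values in place of SlotIndex; the helper name updateSlot is made up for the example.

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper mirroring updateRegMaskSlots on a sorted vector.
void updateSlot(std::vector<unsigned> &SortedSlots, unsigned OldIdx, unsigned NewIdx) {
  auto RI = std::lower_bound(SortedSlots.begin(), SortedSlots.end(), OldIdx);
  assert(RI != SortedSlots.end() && *RI == OldIdx && "No slot at OldIdx");
  *RI = NewIdx;
  // The vector must still be sorted, i.e. the instruction was not moved
  // above or below another regmask-carrying instruction.
  assert(RI == SortedSlots.begin() || *std::prev(RI) < *RI);
  assert(std::next(RI) == SortedSlots.end() || *RI < *std::next(RI));
}

int main() {
  std::vector<unsigned> Slots = {16, 48, 80};
  updateSlot(Slots, /*OldIdx=*/48, /*NewIdx=*/56); // move the middle call a bit later
  assert((Slots == std::vector<unsigned>{16, 56, 80}));
  return 0;
}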
assert((!MI.isBundled() || MI.getOpcode() == TargetOpcode::BUNDLE) && "Cannot move instruction in bundle"); SlotIndex OldIndex = Indexes->getInstructionIndex(MI); Indexes->removeMachineInstrFromMaps(MI); SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI); assert(getMBBStartIdx(MI.getParent()) <= OldIndex && OldIndex < getMBBEndIdx(MI.getParent()) && "Cannot handle moves across basic block boundaries."); HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags); HME.updateAllRanges(&MI); } void LiveIntervals::handleMoveIntoNewBundle(MachineInstr &BundleStart, bool UpdateFlags) { assert((BundleStart.getOpcode() == TargetOpcode::BUNDLE) && "Bundle start is not a bundle"); SmallVector ToProcess; const SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(BundleStart); auto BundleEnd = getBundleEnd(BundleStart.getIterator()); auto I = BundleStart.getIterator(); I++; while (I != BundleEnd) { if (!Indexes->hasIndex(*I)) continue; SlotIndex OldIndex = Indexes->getInstructionIndex(*I, true); ToProcess.push_back(OldIndex); Indexes->removeMachineInstrFromMaps(*I, true); I++; } for (SlotIndex OldIndex : ToProcess) { HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags); HME.updateAllRanges(&BundleStart); } // Fix up dead defs const SlotIndex Index = getInstructionIndex(BundleStart); for (unsigned Idx = 0, E = BundleStart.getNumOperands(); Idx != E; ++Idx) { MachineOperand &MO = BundleStart.getOperand(Idx); if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg.isVirtual() && hasInterval(Reg) && !MO.isUndef()) { LiveInterval &LI = getInterval(Reg); LiveQueryResult LRQ = LI.Query(Index); if (LRQ.isDeadDef()) MO.setIsDead(); } } } void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin, const MachineBasicBlock::iterator End, const SlotIndex EndIdx, LiveRange &LR, const Register Reg, LaneBitmask LaneMask) { LiveInterval::iterator LII = LR.find(EndIdx); SlotIndex lastUseIdx; if (LII == LR.begin()) { // This happens when the function is called for a subregister that only // occurs _after_ the range that is to be repaired. return; } if (LII != LR.end() && LII->start < EndIdx) lastUseIdx = LII->end; else --LII; for (MachineBasicBlock::iterator I = End; I != Begin;) { --I; MachineInstr &MI = *I; if (MI.isDebugInstr()) continue; SlotIndex instrIdx = getInstructionIndex(MI); bool isStartValid = getInstructionFromIndex(LII->start); bool isEndValid = getInstructionFromIndex(LII->end); // FIXME: This doesn't currently handle early-clobber or multiple removed // defs inside of the region to repair. for (MachineInstr::mop_iterator OI = MI.operands_begin(), OE = MI.operands_end(); OI != OE; ++OI) { const MachineOperand &MO = *OI; if (!MO.isReg() || MO.getReg() != Reg) continue; unsigned SubReg = MO.getSubReg(); LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); if ((Mask & LaneMask).none()) continue; if (MO.isDef()) { if (!isStartValid) { if (LII->end.isDead()) { SlotIndex prevStart; if (LII != LR.begin()) prevStart = std::prev(LII)->start; // FIXME: This could be more efficient if there was a // removeSegment method that returned an iterator. 
LR.removeSegment(*LII, true); if (prevStart.isValid()) LII = LR.find(prevStart); else LII = LR.begin(); } else { LII->start = instrIdx.getRegSlot(); LII->valno->def = instrIdx.getRegSlot(); if (MO.getSubReg() && !MO.isUndef()) lastUseIdx = instrIdx.getRegSlot(); else lastUseIdx = SlotIndex(); continue; } } if (!lastUseIdx.isValid()) { VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator); LiveRange::Segment S(instrIdx.getRegSlot(), instrIdx.getDeadSlot(), VNI); LII = LR.addSegment(S); } else if (LII->start != instrIdx.getRegSlot()) { VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator); LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI); LII = LR.addSegment(S); } if (MO.getSubReg() && !MO.isUndef()) lastUseIdx = instrIdx.getRegSlot(); else lastUseIdx = SlotIndex(); } else if (MO.isUse()) { // FIXME: This should probably be handled outside of this branch, // either as part of the def case (for defs inside of the region) or // after the loop over the region. if (!isEndValid && !LII->end.isBlock()) LII->end = instrIdx.getRegSlot(); if (!lastUseIdx.isValid()) lastUseIdx = instrIdx.getRegSlot(); } } } } void LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, ArrayRef OrigRegs) { // Find anchor points, which are at the beginning/end of blocks or at // instructions that already have indexes. while (Begin != MBB->begin() && !Indexes->hasIndex(*Begin)) --Begin; while (End != MBB->end() && !Indexes->hasIndex(*End)) ++End; SlotIndex EndIdx; if (End == MBB->end()) EndIdx = getMBBEndIdx(MBB).getPrevSlot(); else EndIdx = getInstructionIndex(*End); Indexes->repairIndexesInRange(MBB, Begin, End); for (MachineBasicBlock::iterator I = End; I != Begin;) { --I; MachineInstr &MI = *I; if (MI.isDebugInstr()) continue; for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(), MOE = MI.operands_end(); MOI != MOE; ++MOI) { if (MOI->isReg() && Register::isVirtualRegister(MOI->getReg()) && !hasInterval(MOI->getReg())) { createAndComputeVirtRegInterval(MOI->getReg()); } } } for (Register Reg : OrigRegs) { if (!Reg.isVirtual()) continue; LiveInterval &LI = getInterval(Reg); // FIXME: Should we support undefs that gain defs? if (!LI.hasAtLeastOneValue()) continue; for (LiveInterval::SubRange &S : LI.subranges()) repairOldRegInRange(Begin, End, EndIdx, S, Reg, S.LaneMask); repairOldRegInRange(Begin, End, EndIdx, LI, Reg); } } void LiveIntervals::removePhysRegDefAt(MCRegister Reg, SlotIndex Pos) { for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { if (LiveRange *LR = getCachedRegUnit(*Unit)) if (VNInfo *VNI = LR->getVNInfoAt(Pos)) LR->removeValNo(VNI); } } void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) { // LI may not have the main range computed yet, but its subranges may // be present. VNInfo *VNI = LI.getVNInfoAt(Pos); if (VNI != nullptr) { assert(VNI->def.getBaseIndex() == Pos.getBaseIndex()); LI.removeValNo(VNI); } // Also remove the value defined in subranges. 
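repairIntervalsInRange above first widens the repair window until both ends sit on instructions that still have slot indexes, or hit the block boundary. A compact stand-alone model of that anchor search, using a vector<bool> to mark which "instructions" are indexed; the helper name widenToAnchors is invented for the example.

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical helper: widen [Begin, End) outwards until both ends are
// anchored on an indexed instruction or on the block boundary.
void widenToAnchors(const std::vector<bool> &HasIndex, std::size_t &Begin,
                    std::size_t &End) {
  while (Begin != 0 && !HasIndex[Begin])
    --Begin; // walk backwards to an indexed instruction
  while (End != HasIndex.size() && !HasIndex[End])
    ++End;   // walk forwards to an indexed instruction
}

int main() {
  // Instructions 2..4 were just inserted and have no slot index yet.
  std::vector<bool> HasIndex = {true, true, false, false, false, true};
  std::size_t Begin = 2, End = 4;
  widenToAnchors(HasIndex, Begin, End);
  assert(Begin == 1 && End == 5); // both ends now rest on indexed neighbours
  return 0;
}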
for (LiveInterval::SubRange &S : LI.subranges()) { if (VNInfo *SVNI = S.getVNInfoAt(Pos)) if (SVNI->def.getBaseIndex() == Pos.getBaseIndex()) S.removeValNo(SVNI); } LI.removeEmptySubRanges(); } void LiveIntervals::splitSeparateComponents(LiveInterval &LI, SmallVectorImpl &SplitLIs) { ConnectedVNInfoEqClasses ConEQ(*this); unsigned NumComp = ConEQ.Classify(LI); if (NumComp <= 1) return; LLVM_DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n'); Register Reg = LI.reg(); const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); for (unsigned I = 1; I < NumComp; ++I) { Register NewVReg = MRI->createVirtualRegister(RegClass); LiveInterval &NewLI = createEmptyInterval(NewVReg); SplitLIs.push_back(&NewLI); } ConEQ.Distribute(LI, SplitLIs.data(), *MRI); } void LiveIntervals::constructMainRangeFromSubranges(LiveInterval &LI) { assert(LICalc && "LICalc not initialized."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); LICalc->constructMainRangeFromSubranges(LI); } diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 395f437bb648..15af0fb2e888 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -1,526 +1,539 @@ //===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains a pass that expands pseudo instructions into target // instructions to allow proper scheduling, if-conversion, other late // optimizations, or simply the encoding of the instructions. // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. 
#include "llvm/IR/GlobalValue.h" using namespace llvm; #define DEBUG_TYPE "x86-pseudo" #define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass" namespace { class X86ExpandPseudo : public MachineFunctionPass { public: static char ID; X86ExpandPseudo() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); MachineFunctionPass::getAnalysisUsage(AU); } const X86Subtarget *STI = nullptr; const X86InstrInfo *TII = nullptr; const X86RegisterInfo *TRI = nullptr; const X86MachineFunctionInfo *X86FI = nullptr; const X86FrameLowering *X86FL = nullptr; bool runOnMachineFunction(MachineFunction &Fn) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); } StringRef getPassName() const override { return "X86 pseudo instruction expansion pass"; } private: void ExpandICallBranchFunnel(MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI); bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool ExpandMBB(MachineBasicBlock &MBB); }; char X86ExpandPseudo::ID = 0; } // End anonymous namespace. INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false, false) void X86ExpandPseudo::ExpandICallBranchFunnel( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) { MachineBasicBlock *JTMBB = MBB; MachineInstr *JTInst = &*MBBI; MachineFunction *MF = MBB->getParent(); const BasicBlock *BB = MBB->getBasicBlock(); auto InsPt = MachineFunction::iterator(MBB); ++InsPt; std::vector> TargetMBBs; DebugLoc DL = JTInst->getDebugLoc(); MachineOperand Selector = JTInst->getOperand(0); const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal(); auto CmpTarget = [&](unsigned Target) { if (Selector.isReg()) MBB->addLiveIn(Selector.getReg()); BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11) .addReg(X86::RIP) .addImm(1) .addReg(0) .addGlobalAddress(CombinedGlobal, JTInst->getOperand(2 + 2 * Target).getImm()) .addReg(0); BuildMI(*MBB, MBBI, DL, TII->get(X86::CMP64rr)) .add(Selector) .addReg(X86::R11); }; auto CreateMBB = [&]() { auto *NewMBB = MF->CreateMachineBasicBlock(BB); MBB->addSuccessor(NewMBB); if (!MBB->isLiveIn(X86::EFLAGS)) MBB->addLiveIn(X86::EFLAGS); return NewMBB; }; auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) { BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC); auto *ElseMBB = CreateMBB(); MF->insert(InsPt, ElseMBB); MBB = ElseMBB; MBBI = MBB->end(); }; auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) { auto *ThenMBB = CreateMBB(); TargetMBBs.push_back({ThenMBB, Target}); EmitCondJump(CC, ThenMBB); }; auto EmitTailCall = [&](unsigned Target) { BuildMI(*MBB, MBBI, DL, TII->get(X86::TAILJMPd64)) .add(JTInst->getOperand(3 + 2 * Target)); }; std::function EmitBranchFunnel = [&](unsigned FirstTarget, unsigned NumTargets) { if (NumTargets == 1) { EmitTailCall(FirstTarget); return; } if (NumTargets == 2) { CmpTarget(FirstTarget + 1); EmitCondJumpTarget(X86::COND_B, FirstTarget); EmitTailCall(FirstTarget + 1); return; } if (NumTargets < 6) { CmpTarget(FirstTarget + 1); EmitCondJumpTarget(X86::COND_B, FirstTarget); EmitCondJumpTarget(X86::COND_E, FirstTarget + 1); EmitBranchFunnel(FirstTarget + 2, NumTargets - 2); return; } auto *ThenMBB = CreateMBB(); CmpTarget(FirstTarget + (NumTargets / 2)); EmitCondJump(X86::COND_B, ThenMBB); 
EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2)); EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1, NumTargets - (NumTargets / 2) - 1); MF->insert(InsPt, ThenMBB); MBB = ThenMBB; MBBI = MBB->end(); EmitBranchFunnel(FirstTarget, NumTargets / 2); }; EmitBranchFunnel(0, (JTInst->getNumOperands() - 2) / 2); for (auto P : TargetMBBs) { MF->insert(InsPt, P.first); BuildMI(P.first, DL, TII->get(X86::TAILJMPd64)) .add(JTInst->getOperand(3 + 2 * P.second)); } JTMBB->erase(JTInst); } /// If \p MBBI is a pseudo instruction, this method expands /// it to the corresponding (sequence of) actual instruction(s). /// \returns true if \p MBBI has been expanded. bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); switch (Opcode) { default: return false; case X86::TCRETURNdi: case X86::TCRETURNdicc: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNdi64cc: case X86::TCRETURNri64: case X86::TCRETURNmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; MachineOperand &JumpTarget = MBBI->getOperand(0); MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); // Adjust stack pointer. int StackAdj = StackAdjust.getImm(); int MaxTCDelta = X86FI->getTCReturnAddrDelta(); int Offset = 0; assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. Offset = StackAdj - MaxTCDelta; assert(Offset >= 0 && "Offset should never be negative"); if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) { assert(Offset == 0 && "Conditional tail call cannot adjust the stack."); } if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += X86FL->mergeSPUpdates(MBB, MBBI, true); X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true); } // Jump to label or value in register. bool IsWin64 = STI->isTargetWin64(); if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) { unsigned Op; switch (Opcode) { case X86::TCRETURNdi: Op = X86::TAILJMPd; break; case X86::TCRETURNdicc: Op = X86::TAILJMPd_CC; break; case X86::TCRETURNdi64cc: assert(!MBB.getParent()->hasWinCFI() && "Conditional tail calls confuse " "the Win64 unwinder."); Op = X86::TAILJMPd64_CC; break; default: // Note: Win64 uses REX prefixes indirect jumps out of functions, but // not direct ones. Op = X86::TAILJMPd64; break; } MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); if (JumpTarget.isGlobal()) { MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); } else { assert(JumpTarget.isSymbol()); MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); } if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) { MIB.addImm(MBBI->getOperand(2).getImm()); } } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { unsigned Op = (Opcode == X86::TCRETURNmi) ? X86::TAILJMPm : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsWin64 ? 
X86::TAILJMPr64_REX : X86::TAILJMPr64)) .add(JumpTarget); } else { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) .add(JumpTarget); } MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); // Update the call site info. if (MBBI->isCandidateForCallSiteEntry()) MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); return true; } case X86::EH_RETURN: case X86::EH_RETURN64: { MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); const bool Uses64BitFramePtr = STI->isTarget64BitLP64() || STI->isTargetNaCl64(); Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) .addReg(DestAddr.getReg()); // The EH_RETURN pseudo is really removed during the MC Lowering. return true; } case X86::IRET: { // Adjust stack to erase error code int64_t StackAdj = MBBI->getOperand(0).getImm(); X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, true); // Replace pseudo with machine iret BuildMI(MBB, MBBI, DL, TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); MBB.erase(MBBI); return true; } case X86::RET: { // Adjust stack to erase error code int64_t StackAdj = MBBI->getOperand(0).getImm(); MachineInstrBuilder MIB; if (StackAdj == 0) { MIB = BuildMI(MBB, MBBI, DL, TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL)); } else if (isUInt<16>(StackAdj)) { MIB = BuildMI(MBB, MBBI, DL, TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL)) .addImm(StackAdj); } else { assert(!STI->is64Bit() && "shouldn't need to do this for x86_64 targets!"); // A ret can only handle immediates as big as 2**16-1. If we need to pop // off bytes before the return address, we must do it manually. BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define); X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, /*InEpilogue=*/true); BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX); MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL)); } for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I) MIB.add(MBBI->getOperand(I)); MBB.erase(MBBI); return true; } case X86::LCMPXCHG16B_SAVE_RBX: { // Perform the following transformation. // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx // => // RBX = InArg // actualcmpxchg Addr // RBX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); Register SaveRbx = MBBI->getOperand(7).getReg(); // Copy the input argument of the pseudo into the argument of the // actual instruction. // NOTE: We don't copy the kill flag since the input might be the same reg // as one of the other operands of LCMPXCHG16B. TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false); // Create the actual instruction. MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B)); // Copy the operands related to the address. for (unsigned Idx = 1; Idx < 6; ++Idx) NewInstr->addOperand(MBBI->getOperand(Idx)); // Finally, restore the value of RBX. TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); // Delete the pseudo. MBBI->eraseFromParent(); return true; } // Loading/storing mask pairs requires two kmov operations. The second one of // these needs a 2 byte displacement relative to the specified address (with // 32 bit spill size). The pairs of 1bit masks up to 16 bit masks all use the // same spill size, they all are stored using MASKPAIR16STORE, loaded using // MASKPAIR16LOAD. 
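In the MASKPAIR16LOAD and MASKPAIR16STORE expansions below, the 32-bit spill slot is accessed as two 16-bit halves, the second at displacement Disp + 2, and the memory operand is split the same way. A sketch of just that arithmetic, with invented type and helper names; the real code builds two KMOVW instructions and two MachineMemOperands.

#include <cassert>
#include <cstdint>

// Hypothetical types for the example: one half of the split access.
struct Half {
  int64_t Disp;  // displacement used by the KMOVW
  uint64_t Size; // size of the split memory operand, in bytes
};
struct MaskPairSplit {
  Half Lo, Hi;
};

MaskPairSplit splitMaskPair(int64_t Disp) {
  assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
  return {{Disp, 2}, {Disp + 2, 2}};
}

int main() {
  MaskPairSplit S = splitMaskPair(64);
  assert(S.Lo.Disp == 64 && S.Lo.Size == 2); // low half of the mask pair
  assert(S.Hi.Disp == 66 && S.Hi.Size == 2); // high half, two bytes further
  return 0;
}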
// // The displacement value might wrap around in theory, thus the asserts in // both cases. case X86::MASKPAIR16LOAD: { int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); Register Reg = MBBI->getOperand(0).getReg(); bool DstIsDead = MBBI->getOperand(0).isDead(); Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0); Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1); auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm)) .addReg(Reg0, RegState::Define | getDeadRegState(DstIsDead)); auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm)) .addReg(Reg1, RegState::Define | getDeadRegState(DstIsDead)); for (int i = 0; i < X86::AddrNumOperands; ++i) { MIBLo.add(MBBI->getOperand(1 + i)); if (i == X86::AddrDisp) MIBHi.addImm(Disp + 2); else MIBHi.add(MBBI->getOperand(1 + i)); } // Split the memory operand, adjusting the offset and size for the halves. MachineMemOperand *OldMMO = MBBI->memoperands().front(); MachineFunction *MF = MBB.getParent(); MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2); MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2); MIBLo.setMemRefs(MMOLo); MIBHi.setMemRefs(MMOHi); // Delete the pseudo. MBB.erase(MBBI); return true; } case X86::MASKPAIR16STORE: { int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); Register Reg = MBBI->getOperand(X86::AddrNumOperands).getReg(); bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0); Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1); auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk)); auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk)); for (int i = 0; i < X86::AddrNumOperands; ++i) { MIBLo.add(MBBI->getOperand(i)); if (i == X86::AddrDisp) MIBHi.addImm(Disp + 2); else MIBHi.add(MBBI->getOperand(i)); } MIBLo.addReg(Reg0, getKillRegState(SrcIsKill)); MIBHi.addReg(Reg1, getKillRegState(SrcIsKill)); // Split the memory operand, adjusting the offset and size for the halves. MachineMemOperand *OldMMO = MBBI->memoperands().front(); MachineFunction *MF = MBB.getParent(); MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2); MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2); MIBLo.setMemRefs(MMOLo); MIBHi.setMemRefs(MMOHi); // Delete the pseudo. MBB.erase(MBBI); return true; } case X86::MWAITX_SAVE_RBX: { // Perform the following transformation. // SaveRbx = pseudomwaitx InArg, SaveRbx // => // [E|R]BX = InArg // actualmwaitx // [E|R]BX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(1); // Copy the input argument of the pseudo into the argument of the // actual instruction. TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill()); // Create the actual instruction. BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr)); // Finally, restore the value of RBX. Register SaveRbx = MBBI->getOperand(2).getReg(); TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); // Delete the pseudo. 
MBBI->eraseFromParent(); return true; } case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; + case X86::PLDTILECFG: { + MI.RemoveOperand(0); + MI.setDesc(TII->get(X86::LDTILECFG)); + return true; + } + case X86::PSTTILECFG: { + MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg + MI.setDesc(TII->get(X86::STTILECFG)); + return true; + } case X86::PTILELOADDV: { + MI.RemoveOperand(8); // Remove $tmmcfg for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILELOADD)); return true; } case X86::PTDPBSSDV: { + MI.RemoveOperand(7); // Remove $tmmcfg MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TDPBSSD)); MI.tieOperands(0, 1); return true; } case X86::PTILESTOREDV: { + MI.RemoveOperand(8); // Remove $tmmcfg for (int i = 1; i >= 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { - for (int i = 2; i > 0; --i) // Remove row, col + for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; } } llvm_unreachable("Previous switch has a fallthrough?"); } /// Expand all pseudo instructions contained in \p MBB. /// \returns true if any expansion occurred for \p MBB. bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { bool Modified = false; // MBBI may be invalidated by the expansion. MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); Modified |= ExpandMI(MBB, MBBI); MBBI = NMBBI; } return Modified; } bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { STI = &static_cast(MF.getSubtarget()); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); X86FI = MF.getInfo(); X86FL = STI->getFrameLowering(); bool Modified = false; for (MachineBasicBlock &MBB : MF) Modified |= ExpandMBB(MBB); return Modified; } /// Returns an instance of the pseudo instruction expansion pass. FunctionPass *llvm::createX86ExpandPseudoPass() { return new X86ExpandPseudo(); } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index fcddfb93b7a3..8339f512158d 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1,3595 +1,3589 @@ //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of TargetFrameLowering class. 
// //===----------------------------------------------------------------------===// #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetOptions.h" #include #define DEBUG_TYPE "x86-fl" STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue"); STATISTIC(NumFrameExtraProbe, "Number of extra stack probes generated in prologue"); using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(), STI.is64Bit() ? -8 : -4), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { // Cache a bunch of frame-related predicates for this subtarget. SlotSize = TRI->getSlotSize(); Is64Bit = STI.is64Bit(); IsLP64 = STI.isTarget64BitLP64(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); StackPtr = TRI->getStackRegister(); } bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo().hasVarSizedObjects() && !MF.getInfo()->getHasPushSequences() && !MF.getInfo()->hasPreallocatedCall(); } /// canSimplifyCallFramePseudos - If there is a reserved call frame, the /// call frame pseudos can be simplified. Having a FP, as in the default /// implementation, is not sufficient here since we can't always use it. /// Use a more nuanced condition. bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getInfo()->hasPreallocatedCall() || (hasFP(MF) && !TRI->needsStackRealignment(MF)) || TRI->hasBasePointer(MF); } // needsFrameIndexResolution - Do we need to perform FI resolution for // this function. Normally, this is required only when the function // has any stack objects. However, FI resolution actually has another job, // not apparent from the title - it resolves callframesetup/destroy // that were not simplified earlier. // So, this is required for x86 functions that have push sequences even // when there are no stack objects. bool X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { return MF.getFrameInfo().hasStackObjects() || MF.getInfo()->getHasPushSequences(); } /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. 
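The getSUBriOpcode, getADDriOpcode and getANDriOpcode helpers a little further down pick the 8-bit-immediate encodings whenever the adjustment fits in a sign-extended byte, which keeps the common small stack adjustments short. A stand-alone re-creation of that width check (LLVM spells it isInt<8>), with strings standing in for the opcode enums and an invented helper name:

#include <cassert>
#include <cstdint>
#include <string>

// Stand-alone equivalent of the signed-width check: does the immediate fit
// in a sign-extended byte?
constexpr bool isInt8(int64_t Imm) { return Imm >= INT8_MIN && Imm <= INT8_MAX; }

// Hypothetical helper mirroring the structure of getSUBriOpcode.
std::string subOpcodeFor(bool IsLP64, int64_t Imm) {
  if (IsLP64)
    return isInt8(Imm) ? "SUB64ri8" : "SUB64ri32";
  return isInt8(Imm) ? "SUB32ri8" : "SUB32ri";
}

int main() {
  assert(subOpcodeFor(true, 8) == "SUB64ri8");     // fits in imm8
  assert(subOpcodeFor(true, 4096) == "SUB64ri32"); // needs the 32-bit form
  assert(subOpcodeFor(false, -128) == "SUB32ri8"); // imm8 is sign-extended
  return 0;
}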
bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() || MF.getInfo()->getForceFramePointer() || MF.getInfo()->hasPreallocatedCall() || MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() || MFI.hasStackMap() || MFI.hasPatchPoint() || MFI.hasCopyImplyingStackAdjustment()); } static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::SUB64ri8; return X86::SUB64ri32; } else { if (isInt<8>(Imm)) return X86::SUB32ri8; return X86::SUB32ri; } } static unsigned getADDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::ADD64ri8; return X86::ADD64ri32; } else { if (isInt<8>(Imm)) return X86::ADD32ri8; return X86::ADD32ri; } } static unsigned getSUBrrOpcode(bool IsLP64) { return IsLP64 ? X86::SUB64rr : X86::SUB32rr; } static unsigned getADDrrOpcode(bool IsLP64) { return IsLP64 ? X86::ADD64rr : X86::ADD32rr; } static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::AND64ri8; return X86::AND64ri32; } if (isInt<8>(Imm)) return X86::AND32ri8; return X86::AND32ri; } static unsigned getLEArOpcode(bool IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } static bool isEAXLiveIn(MachineBasicBlock &MBB) { for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) { unsigned Reg = RegMask.PhysReg; if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || Reg == X86::AH || Reg == X86::AL) return true; } return false; } /// Check if the flags need to be preserved before the terminators. /// This would be the case, if the eflags is live-in of the region /// composed by the terminators or live-out of that region, without /// being defined by a terminator. static bool flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg != X86::EFLAGS) continue; // This terminator needs an eflags that is not defined // by a previous another terminator: // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; // This terminator defines the eflags, i.e., we don't need to preserve it. // However, we still need to check this specific terminator does not // read a live-in value. BreakNext = true; } // We found a definition of the eflags, no need to preserve them. if (BreakNext) return false; } // None of the terminators use or define the eflags. // Check if they are live-out, that would imply we need to preserve them. for (const MachineBasicBlock *Succ : MBB.successors()) if (Succ->isLiveIn(X86::EFLAGS)) return true; return false; } /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const DebugLoc &DL, int64_t NumBytes, bool InEpilogue) const { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; MachineInstr::MIFlag Flag = isSub ? 
MachineInstr::FrameSetup : MachineInstr::FrameDestroy; uint64_t Chunk = (1LL << 31) - 1; MachineFunction &MF = *MBB.getParent(); const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); // It's ok to not take into account large chunks when probing, as the // allocation is split in smaller chunks anyway. if (EmitInlineStackProbe && !InEpilogue) { // This pseudo-instruction is going to be expanded, potentially using a // loop, by inlineStackProbe(). BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset); return; } else if (Offset > Chunk) { // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); if (isSub && !isEAXLiveIn(MBB)) Reg = Rax; else Reg = TRI->findDeadCallerSavedReg(MBB, MBBI); unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; unsigned AddSubRROpc = isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit); if (Reg) { BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr) .addReg(StackPtr) .addReg(Reg); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. return; } else if (Offset > 8 * Chunk) { // If we would need more than 8 add or sub instructions (a >16GB stack // frame), it's worth spilling RAX to materialize this immediate. // pushq %rax // movabsq +-$Offset+-SlotSize, %rax // addq %rsp, %rax // xchg %rax, (%rsp) // movq (%rsp), %rsp assert(Is64Bit && "can't have 32-bit 16GB stack frame"); BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) .addReg(Rax, RegState::Kill) .setMIFlag(Flag); // Subtract is not commutative, so negate the offset and always use add. // Subtract 8 less and add 8 more to account for the PUSH we just did. if (isSub) Offset = -(Offset - SlotSize); else Offset = Offset + SlotSize; BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax) .addReg(Rax) .addReg(StackPtr); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. // Exchange the new SP in RAX with the top of the stack. addRegOffset( BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax), StackPtr, false, 0); // Load new SP from the top of the stack into RSP. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr), StackPtr, false, 0); return; } } while (Offset) { uint64_t ThisVal = std::min(Offset, Chunk); if (ThisVal == SlotSize) { // Use push / pop for slot sized adjustments as a size optimization. We // need to find a dead register when using pop. unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) : TRI->findDeadCallerSavedReg(MBB, MBBI); if (Reg) { unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) : (Is64Bit ? X86::POP64r : X86::POP32r); BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)) .setMIFlag(Flag); Offset -= ThisVal; continue; } } BuildStackAdjustment(MBB, MBBI, DL, isSub ? 
-ThisVal : ThisVal, InEpilogue) .setMIFlag(Flag); Offset -= ThisVal; } } MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Offset, bool InEpilogue) const { assert(Offset != 0 && "zero offset stack adjustment requested"); // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue // is tricky. bool UseLEA; if (!InEpilogue) { // Check if inserting the prologue at the beginning // of MBB would require to use LEA operations. // We need to use LEA operations if EFLAGS is live in, because // it means an instruction will read it before it gets defined. UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert // a ADD that will redefine the eflags and break the condition. // Alternatively, we could move the ADD, but this may not be possible // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } MachineInstrBuilder MI; if (UseLEA) { MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(getLEArOpcode(Uses64BitFramePtr)), StackPtr), StackPtr, false, Offset); } else { bool IsSub = Offset < 0; uint64_t AbsOffset = IsSub ? -Offset : Offset; const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) : getADDriOpcode(Uses64BitFramePtr, AbsOffset); MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(AbsOffset); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } return MI; } int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, bool doMergeWithPrevious) const { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; PI = skipDebugInstructionsBackward(PI, MBB.begin()); // It is assumed that ADD/SUB/LEA instruction is succeded by one CFI // instruction, and that there are no DBG_VALUE or other instructions between // ADD/SUB/LEA and its corresponding CFI instruction. /* TODO: Add support for the case where there are multiple CFI instructions below the ADD/SUB/LEA, e.g.: ... add cfi_def_cfa_offset cfi_offset ... */ if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction()) PI = std::prev(PI); unsigned Opc = PI->getOpcode(); int Offset = 0; if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && PI->getOperand(0).getReg() == StackPtr){ assert(PI->getOperand(1).getReg() == StackPtr); Offset = PI->getOperand(2).getImm(); } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) && PI->getOperand(0).getReg() == StackPtr && PI->getOperand(1).getReg() == StackPtr && PI->getOperand(2).getImm() == 1 && PI->getOperand(3).getReg() == X86::NoRegister && PI->getOperand(5).getReg() == X86::NoRegister) { // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. 
Offset = PI->getOperand(4).getImm(); } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && PI->getOperand(0).getReg() == StackPtr) { assert(PI->getOperand(1).getReg() == StackPtr); Offset = -PI->getOperand(2).getImm(); } else return 0; PI = MBB.erase(PI); if (PI != MBB.end() && PI->isCFIInstruction()) PI = MBB.erase(PI); if (!doMergeWithPrevious) MBBI = skipDebugInstructionsForward(PI, MBB.end()); return Offset; } void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const { MachineFunction &MF = *MBB.getParent(); unsigned CFIIndex = MF.addFrameInst(CFIInst); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } /// Emits Dwarf Info specifying offsets of callee saved registers and /// frame pointer. This is called only when basic block sections are enabled. void X86FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); if (!hasFP(MF)) { emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true); return; } const MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); const Register FramePtr = TRI->getFrameRegister(MF); const Register MachineFramePtr = STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true); // Offset = space for return address + size of the frame pointer itself. unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4); BuildCFI(MBB, MBBI, DebugLoc{}, MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset)); emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true); } void X86FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool IsPrologue) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); // Add callee saved registers to move list. const std::vector &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) return; // Calculate offsets. 
for (std::vector::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); if (IsPrologue) { BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); } else { BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createRestore(nullptr, DwarfReg)); } } } void X86FrameLowering::emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget(); if (STI.isTargetWindowsCoreCLR()) { if (InProlog) { BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)) .addImm(0 /* no explicit stack size */); } else { emitStackProbeInline(MF, MBB, MBBI, DL, false); } } else { emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); } } void X86FrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const { auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) { return MI.getOpcode() == X86::STACKALLOC_W_PROBING; }); if (Where != PrologMBB.end()) { DebugLoc DL = PrologMBB.findDebugLoc(Where); emitStackProbeInline(MF, PrologMBB, Where, DL, true); Where->eraseFromParent(); } } void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget(); if (STI.isTargetWindowsCoreCLR() && STI.is64Bit()) emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog); else emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog); } void X86FrameLowering::emitStackProbeInlineGeneric( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { MachineInstr &AllocWithProbe = *MBBI; uint64_t Offset = AllocWithProbe.getOperand(0).getImm(); const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) && "different expansion expected for CoreCLR 64 bit"); const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); uint64_t ProbeChunk = StackProbeSize * 8; uint64_t MaxAlign = TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0; // Synthesize a loop or unroll it, depending on the number of iterations. // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bits left // between the unaligned rsp and current rsp. if (Offset > ProbeChunk) { emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset, MaxAlign % StackProbeSize); } else { emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset, MaxAlign % StackProbeSize); } } void X86FrameLowering::emitStackProbeInlineGenericBlock( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, uint64_t AlignOffset) const { const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); uint64_t CurrentOffset = 0; assert(AlignOffset < StackProbeSize); // If the offset is so small it fits within a page, there's nothing to do. 
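emitStackProbeInlineGenericBlock below cuts the allocation into page-sized (StackProbeSize) steps, each followed by a touch of the new stack top, and leaves only a sub-page tail unprobed. A host-side sketch of that schedule that returns just the SUB amounts; the probes themselves are the MOV stores in the real code, and the helper name probeSchedule is invented.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper: the sequence of stack-pointer decrements emitted for
// an allocation of Offset bytes with an AlignOffset already consumed by the
// caller's alignment handling.
std::vector<uint64_t> probeSchedule(uint64_t Offset, uint64_t AlignOffset,
                                    uint64_t StackProbeSize) {
  assert(AlignOffset < StackProbeSize);
  std::vector<uint64_t> Subs;
  uint64_t Current = 0;
  if (StackProbeSize < Offset + AlignOffset) {
    Subs.push_back(StackProbeSize - AlignOffset); // first step, probed
    Current = StackProbeSize - AlignOffset;
  }
  while (Current + StackProbeSize < Offset) {     // middle pages, each probed
    Subs.push_back(StackProbeSize);
    Current += StackProbeSize;
  }
  Subs.push_back(Offset - Current);               // tail, smaller than a page, not probed
  return Subs;
}

int main() {
  // 4 KiB pages, 10000-byte allocation, no alignment adjustment:
  // 4096 + 4096 + 1808 == 10000.
  auto S = probeSchedule(10000, 0, 4096);
  assert((S == std::vector<uint64_t>{4096, 4096, 1808}));
  return 0;
}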
if (StackProbeSize < Offset + AlignOffset) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize - AlignOffset) .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) .setMIFlag(MachineInstr::FrameSetup), StackPtr, false, 0) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); NumFrameExtraProbe++; CurrentOffset = StackProbeSize - AlignOffset; } // For the next N - 1 pages, just probe. I tried to take advantage of // natural probes but it implies much more logic and there was very few // interesting natural probes to interleave. while (CurrentOffset + StackProbeSize < Offset) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) .setMIFlag(MachineInstr::FrameSetup), StackPtr, false, 0) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); NumFrameExtraProbe++; CurrentOffset += StackProbeSize; } // No need to probe the tail, it is smaller than a Page. uint64_t ChunkSize = Offset - CurrentOffset; MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(ChunkSize) .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } void X86FrameLowering::emitStackProbeInlineGenericLoop( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, uint64_t AlignOffset) const { assert(Offset && "null offset"); const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); if (AlignOffset) { if (AlignOffset < StackProbeSize) { // Perform a first smaller allocation followed by a probe. const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(AlignOffset) .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) .setMIFlag(MachineInstr::FrameSetup), StackPtr, false, 0) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); NumFrameExtraProbe++; Offset -= AlignOffset; } } // Synthesize a loop NumFrameLoopProbe++; const BasicBlock *LLVM_BB = MBB.getBasicBlock(); MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB.getIterator(); MF.insert(MBBIter, testMBB); MF.insert(MBBIter, tailMBB); Register FinalStackProbed = Uses64BitFramePtr ? 
X86::R11 : X86::R11D; BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); // save loop bound { const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset); BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed) .addReg(FinalStackProbed) .addImm(Offset / StackProbeSize * StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); } // allocate a page { const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); } // touch the page addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc)) .setMIFlag(MachineInstr::FrameSetup), StackPtr, false, 0) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); // cmp with stack pointer bound BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) .addReg(StackPtr) .addReg(FinalStackProbed) .setMIFlag(MachineInstr::FrameSetup); // jump BuildMI(testMBB, DL, TII.get(X86::JCC_1)) .addMBB(testMBB) .addImm(X86::COND_NE) .setMIFlag(MachineInstr::FrameSetup); testMBB->addSuccessor(testMBB); testMBB->addSuccessor(tailMBB); // BB management tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end()); tailMBB->transferSuccessorsAndUpdatePHIs(&MBB); MBB.addSuccessor(testMBB); // handle tail unsigned TailOffset = Offset % StackProbeSize; if (TailOffset) { const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset); BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(TailOffset) .setMIFlag(MachineInstr::FrameSetup); } // Update Live In information recomputeLiveIns(*testMBB); recomputeLiveIns(*tailMBB); } void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget(); assert(STI.is64Bit() && "different expansion needed for 32 bit"); assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); const TargetInstrInfo &TII = *STI.getInstrInfo(); const BasicBlock *LLVM_BB = MBB.getBasicBlock(); // RAX contains the number of bytes of desired stack adjustment. // The handling here assumes this value has already been updated so as to // maintain stack alignment. // // We need to exit with RSP modified by this amount and execute suitable // page touches to notify the OS that we're growing the stack responsibly. // All stack probing must be done without modifying RSP. // // MBB: // SizeReg = RAX; // ZeroReg = 0 // CopyReg = RSP // Flags, TestReg = CopyReg - SizeReg // FinalReg = !Flags.Ovf ? TestReg : ZeroReg // LimitReg = gs magic thread env access // if FinalReg >= LimitReg goto ContinueMBB // RoundBB: // RoundReg = page address of FinalReg // LoopMBB: // LoopReg = PHI(LimitReg,ProbeReg) // ProbeReg = LoopReg - PageSize // [ProbeReg] = 0 // if (ProbeReg > RoundReg) goto LoopMBB // ContinueMBB: // RSP = RSP - RAX // [rest of original MBB] // Set up the new basic blocks MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); MF.insert(MBBIter, RoundMBB); MF.insert(MBBIter, LoopMBB); MF.insert(MBBIter, ContinueMBB); // Split MBB and move the tail portion down to ContinueMBB. 
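A hedged, host-side model of the CoreCLR probing scheme laid out in the block comment above: compute the final SP, clamping to zero on unsigned overflow, skip probing when it is already at or above the committed limit, otherwise round it down to a page boundary and touch every page from just below the limit down to that boundary, highest address first. The helper name coreclrProbePages is invented and plain integers stand in for registers; the real code emits the compare, CMOV and loop as machine instructions across RoundMBB, LoopMBB and ContinueMBB.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper returning the page addresses the probe loop touches.
std::vector<uint64_t> coreclrProbePages(uint64_t RSP, uint64_t Size,
                                        uint64_t StackLimit,
                                        uint64_t PageSize = 0x1000) {
  const uint64_t PageMask = ~(PageSize - 1);
  uint64_t Final = Size > RSP ? 0 : RSP - Size;   // CMOV of zero on overflow
  std::vector<uint64_t> Pages;
  if (Final >= StackLimit)                        // if FinalReg >= LimitReg goto ContinueMBB
    return Pages;
  uint64_t Rounded = Final & PageMask;            // RoundReg
  for (uint64_t Probe = StackLimit - PageSize;; Probe -= PageSize) {
    Pages.push_back(Probe);                       // [ProbeReg] = 0
    if (Probe <= Rounded)                         // loop while ProbeReg > RoundReg
      break;
  }
  return Pages;
}

int main() {
  // Grow the stack three pages past the committed limit: three probes.
  auto P = coreclrProbePages(/*RSP=*/0x20000, /*Size=*/0x4000,
                             /*StackLimit=*/0x1F000);
  assert((P == std::vector<uint64_t>{0x1E000, 0x1D000, 0x1C000}));
  return 0;
}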
MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); // Some useful constants const int64_t ThreadEnvironmentStackLimit = 0x10; const int64_t PageSize = 0x1000; const int64_t PageMask = ~(PageSize - 1); // Registers we need. For the normal case we use virtual // registers. For the prolog expansion we use RAX, RCX and RDX. MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RegClass = &X86::GR64RegClass; const Register SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass), ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass); // SP-relative offsets where we can save RCX and RDX. int64_t RCXShadowSlot = 0; int64_t RDXShadowSlot = 0; // If inlining in the prolog, save RCX and RDX. if (InProlog) { // Compute the offsets. We need to account for things already // pushed onto the stack at this point: return address, frame // pointer (if used), and callee saves. X86MachineFunctionInfo *X86FI = MF.getInfo(); const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); const bool HasFP = hasFP(MF); // Check if we need to spill RCX and/or RDX. // Here we assume that no earlier prologue instruction changes RCX and/or // RDX, so checking the block live-ins is enough. const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX); const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX); int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); // Assign the initial slot to both registers, then change RDX's slot if both // need to be spilled. if (IsRCXLiveIn) RCXShadowSlot = InitSlot; if (IsRDXLiveIn) RDXShadowSlot = InitSlot; if (IsRDXLiveIn && IsRCXLiveIn) RDXShadowSlot += 8; // Emit the saves if needed. if (IsRCXLiveIn) addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, RCXShadowSlot) .addReg(X86::RCX); if (IsRDXLiveIn) addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, RDXShadowSlot) .addReg(X86::RDX); } else { // Not in the prolog. Copy RAX to a virtual reg. BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); } // Add code to MBB to check for overflow and set the new target stack pointer // to zero if so. BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg) .addReg(ZeroReg, RegState::Undef) .addReg(ZeroReg, RegState::Undef); BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP); BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) .addReg(CopyReg) .addReg(SizeReg); BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg) .addReg(TestReg) .addReg(ZeroReg) .addImm(X86::COND_B); // FinalReg now holds final stack pointer value, or zero if // allocation would overflow. Compare against the current stack // limit from the thread environment block. 
Note this limit is the // lowest touched page on the stack, not the point at which the OS // will cause an overflow exception, so this is just an optimization // to avoid unnecessarily touching pages that are below the current // SP but already committed to the stack by the OS. BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) .addReg(0) .addImm(1) .addReg(0) .addImm(ThreadEnvironmentStackLimit) .addReg(X86::GS); BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); // Jump if the desired stack pointer is at or above the stack limit. BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE); // Add code to roundMBB to round the final stack pointer to a page boundary. RoundMBB->addLiveIn(FinalReg); BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) .addReg(FinalReg) .addImm(PageMask); BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); // LimitReg now holds the current stack limit, RoundedReg page-rounded // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page // and probe until we reach RoundedReg. if (!InProlog) { BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) .addReg(LimitReg) .addMBB(RoundMBB) .addReg(ProbeReg) .addMBB(LoopMBB); } LoopMBB->addLiveIn(JoinReg); addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, false, -PageSize); // Probe by storing a byte onto the stack. BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) .addReg(ProbeReg) .addImm(1) .addReg(0) .addImm(0) .addReg(0) .addImm(0); LoopMBB->addLiveIn(RoundedReg); BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE); MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); // If in prolog, restore RDX and RCX. if (InProlog) { if (RCXShadowSlot) // It means we spilled RCX in the prologue. addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), X86::RCX), X86::RSP, false, RCXShadowSlot); if (RDXShadowSlot) // It means we spilled RDX in the prologue. addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), X86::RDX), X86::RSP, false, RDXShadowSlot); } // Now that the probing is done, add code to continueMBB to update // the stack pointer for real. ContinueMBB->addLiveIn(SizeReg); BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(SizeReg); // Add the control flow edges we need. MBB.addSuccessor(ContinueMBB); MBB.addSuccessor(RoundMBB); RoundMBB->addSuccessor(LoopMBB); LoopMBB->addSuccessor(ContinueMBB); LoopMBB->addSuccessor(LoopMBB); // Mark all the instructions added to the prolog as frame setup. if (InProlog) { for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { BeforeMBBI->setFlag(MachineInstr::FrameSetup); } for (MachineInstr &MI : *RoundMBB) { MI.setFlag(MachineInstr::FrameSetup); } for (MachineInstr &MI : *LoopMBB) { MI.setFlag(MachineInstr::FrameSetup); } for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); CMBBI != ContinueMBBI; ++CMBBI) { CMBBI->setFlag(MachineInstr::FrameSetup); } } } void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; // FIXME: Add indirect thunk support and remove this. 
  if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
    report_fatal_error("Emitting stack probe calls on 64-bit with the large "
                       "code model and indirect thunks not yet implemented.");

  unsigned CallOp;
  if (Is64Bit)
    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
  else
    CallOp = X86::CALLpcrel32;

  StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);

  MachineInstrBuilder CI;
  MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);

  // All current stack probes take AX and SP as input, clobber flags, and
  // preserve all registers. x86_64 probes leave RSP unmodified.
  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
    // For the large code model, we have to call through a register. Use R11,
    // as it is scratch in all supported calling conventions.
    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
        .addExternalSymbol(MF.createExternalSymbolName(Symbol));
    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
  } else {
    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
             .addExternalSymbol(MF.createExternalSymbolName(Symbol));
  }

  unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
  unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
  CI.addReg(AX, RegState::Implicit)
      .addReg(SP, RegState::Implicit)
      .addReg(AX, RegState::Define | RegState::Implicit)
      .addReg(SP, RegState::Define | RegState::Implicit)
      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);

  if (STI.isTargetWin64() || !STI.isOSWindows()) {
    // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
    // themselves. They also do not clobber %rax, so we can reuse it when
    // adjusting %rsp.
    // All other platforms do not specify a particular ABI for the stack probe
    // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
    BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
        .addReg(SP)
        .addReg(AX);
  }

  if (InProlog) {
    // Apply the frame setup flag to all inserted instrs.
    for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
      ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
  }
}

static unsigned calculateSetFPREG(uint64_t SPAdjust) {
  // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
  // and might require smaller successive adjustments.
  const uint64_t Win64MaxSEHOffset = 128;
  uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
  // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
  return SEHFrameOffset & -16;
}

// If we're forcing a stack realignment we can't rely on just the frame
// info, we need to know the ABI stack alignment as well in case we
// have a call out. Otherwise just make sure we have some alignment - we'll
// go with the minimum SlotSize.
uint64_t
X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
  Align StackAlign = getStackAlign();
  if (MF.getFunction().hasFnAttribute("stackrealign")) {
    if (MFI.hasCalls())
      MaxAlign = (StackAlign > MaxAlign) ?
StackAlign : MaxAlign; else if (MaxAlign < SlotSize) MaxAlign = Align(SlotSize); } return MaxAlign.value(); } void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned Reg, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); MachineFunction &MF = *MBB.getParent(); const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); // We want to make sure that (in worst case) less than StackProbeSize bytes // are not probed after the AND. This assumption is used in // emitStackProbeInlineGeneric. if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) { { NumFrameLoopProbe++; MachineBasicBlock *entryMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); MachineBasicBlock *headMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); MachineBasicBlock *bodyMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); MachineBasicBlock *footMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); MachineFunction::iterator MBBIter = MBB.getIterator(); MF.insert(MBBIter, entryMBB); MF.insert(MBBIter, headMBB); MF.insert(MBBIter, bodyMBB); MF.insert(MBBIter, footMBB); const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D; // Setup entry block { entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI); BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); MachineInstr *MI = BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed) .addReg(FinalStackProbed) .addImm(Val) .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); BuildMI(entryMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) .addReg(FinalStackProbed) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); BuildMI(entryMBB, DL, TII.get(X86::JCC_1)) .addMBB(&MBB) .addImm(X86::COND_E) .setMIFlag(MachineInstr::FrameSetup); entryMBB->addSuccessor(headMBB); entryMBB->addSuccessor(&MBB); } // Loop entry block { const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); BuildMI(headMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) .addReg(FinalStackProbed) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); // jump BuildMI(headMBB, DL, TII.get(X86::JCC_1)) .addMBB(footMBB) .addImm(X86::COND_B) .setMIFlag(MachineInstr::FrameSetup); headMBB->addSuccessor(bodyMBB); headMBB->addSuccessor(footMBB); } // setup loop body { addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc)) .setMIFlag(MachineInstr::FrameSetup), StackPtr, false, 0) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); // cmp with stack pointer bound BuildMI(bodyMBB, DL, TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr)) .addReg(FinalStackProbed) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); // jump BuildMI(bodyMBB, DL, TII.get(X86::JCC_1)) .addMBB(bodyMBB) .addImm(X86::COND_B) .setMIFlag(MachineInstr::FrameSetup); bodyMBB->addSuccessor(bodyMBB); bodyMBB->addSuccessor(footMBB); } // setup loop footer { BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr) .addReg(FinalStackProbed) .setMIFlag(MachineInstr::FrameSetup); addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc)) .setMIFlag(MachineInstr::FrameSetup), StackPtr, false, 0) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); footMBB->addSuccessor(&MBB); } recomputeLiveIns(*headMBB); recomputeLiveIns(*bodyMBB); recomputeLiveIns(*footMBB); recomputeLiveIns(MBB); } } else { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) .addReg(Reg) .addImm(Val) .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); } } bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be // clobbered by any interrupt handler. assert(&STI == &MF.getSubtarget() && "MF used frame lowering for wrong subtarget"); const Function &Fn = MF.getFunction(); const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv()); return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone); } /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. /* Here's a gist of what gets emitted: ; Establish frame pointer, if needed [if needs FP] push %rbp .cfi_def_cfa_offset 16 .cfi_offset %rbp, -16 .seh_pushreg %rpb mov %rsp, %rbp .cfi_def_cfa_register %rbp ; Spill general-purpose registers [for all callee-saved GPRs] pushq % [if not needs FP] .cfi_def_cfa_offset (offset from RETADDR) .seh_pushreg % ; If the required stack alignment > default stack alignment ; rsp needs to be re-aligned. This creates a "re-alignment gap" ; of unknown size in the stack frame. [if stack needs re-alignment] and $MASK, %rsp ; Allocate space for locals [if target is Windows and allocated space > 4096 bytes] ; Windows needs special care for allocations larger ; than one page. mov $NNN, %rax call ___chkstk_ms/___chkstk sub %rax, %rsp [else] sub $NNN, %rsp [if needs FP] .seh_stackalloc (size of XMM spill slots) .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots [else] .seh_stackalloc NNN ; Spill XMMs ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, ; they may get spilled on any platform, if the current function ; calls @llvm.eh.unwind.init [if needs FP] [for all callee-saved XMM registers] movaps %, -MMM(%rbp) [for all callee-saved XMM registers] .seh_savexmm %, (-MMM + SEHFrameOffset) ; i.e. 
the offset relative to (%rbp - SEHFrameOffset) [else] [for all callee-saved XMM registers] movaps %, KKK(%rsp) [for all callee-saved XMM registers] .seh_savexmm %, KKK .seh_endprologue [if needs base pointer] mov %rsp, %rbx [if needs to restore base pointer] mov %rsp, -MMM(%rbp) ; Emit CFI info [if needs FP] [for all callee-saved registers] .cfi_offset %, (offset from %rbp) [else] .cfi_def_cfa_offset (offset from RETADDR) [for all callee-saved registers] .cfi_offset %, (offset from %rsp) Notes: - .seh directives are emitted only for Windows 64 ABI - .cv_fpo directives are emitted on win32 when emitting CodeView - .cfi directives are emitted for all other ABIs - for 32-bit code, substitute %e?? registers for %r?? */ void X86FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&STI == &MF.getSubtarget() && "MF used frame lowering for wrong subtarget"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo &MFI = MF.getFrameInfo(); const Function &Fn = MF.getFunction(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate. bool IsFunclet = MBB.isEHFuncletEntry(); EHPersonality Personality = EHPersonality::Unknown; if (Fn.hasPersonalityFn()) Personality = classifyEHPersonality(Fn.getPersonalityFn()); bool FnHasClrFunclet = MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR; bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry(); // FIXME: Emit FPO data for EH funclets. bool NeedsWinFPO = !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag(); bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO; bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves(); Register FramePtr = TRI->getFrameRegister(MF); const Register MachineFramePtr = STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; Register BasePtr = TRI->getBaseRegister(); bool HasWinCFI = false; // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. DebugLoc DL; // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta && IsWin64Prologue) report_fatal_error("Can't handle guaranteed tail call under win64 yet"); if (TailCallReturnAddrDelta < 0) X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); const bool EmitStackProbeCall = STI.getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is // used and an error code was pushed, since the x86-64 ABI requires a 16-byte // stack alignment. if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit && Fn.arg_size() == 2) { StackSize += 8; MFI.setStackSize(StackSize); emitSPUpdate(MBB, MBBI, DL, -8, /*InEpilogue=*/false); } // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. 
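// Editor's note: a minimal standalone sketch (hypothetical helper, not the code in the
// branch below) of how that branch shrinks the allocation when the red zone applies.
// Up to 128 bytes of locals may live below RSP without an explicit adjustment, but the
// result is never smaller than the space already taken by the pushed callee-saved
// registers (plus the saved frame pointer); SlotSize == 8 assumes x86-64.
#include <algorithm>
#include <cstdint>

static uint64_t redZoneShrinkSketch(uint64_t StackSize, uint64_t CalleeSavedSize,
                                    bool HasFP) {
  const uint64_t SlotSize = 8;
  uint64_t MinSize = CalleeSavedSize + (HasFP ? SlotSize : 0);
  return std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
}
// e.g. a leaf function with 96 bytes of locals and no CSRs ends up with StackSize == 0
// (no SUB on RSP at all), while one with 200 bytes still allocates 72.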
if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !EmitStackProbeCall && // No stack probes. !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI.setStackSize(StackSize); } // Insert stack pointer adjustment for later moving of return addr. Only // applies to tail call optimized functions where the callee argument stack // size is bigger than the callers. if (TailCallReturnAddrDelta < 0) { BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta, /*InEpilogue=*/false) .setMIFlag(MachineInstr::FrameSetup); } // Mapping for machine moves: // // DST: VirtualFP AND // SRC: VirtualFP => DW_CFA_def_cfa_offset // ELSE => DW_CFA_def_cfa // // SRC: VirtualFP AND // DST: Register => DW_CFA_def_cfa_register // // ELSE // OFFSET < 0 => DW_CFA_offset_extended_sf // REG < 64 => DW_CFA_offset + Reg // ELSE => DW_CFA_offset_extended uint64_t NumBytes = 0; int stackGrowth = -SlotSize; // Find the funclet establisher parameter Register Establisher = X86::NoRegister; if (IsClrFunclet) Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX; else if (IsFunclet) Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { // Immediately spill establisher into the home slot. // The runtime cares about this. // MOV64mr %rdx, 16(%rsp) unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) .addReg(Establisher) .setMIFlag(MachineInstr::FrameSetup); MBB.addLiveIn(Establisher); } if (HasFP) { assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved"); // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); // Callee-saved registers are pushed on stack before the stack is realigned. if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = alignTo(NumBytes, MaxAlign); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, DwarfFramePtr, 2 * stackGrowth)); } if (NeedsWinCFI) { HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } if (!IsWin64Prologue && !IsFunclet) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. 
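      // Editor's note: concretely, for x86-64 (SlotSize == 8, stackGrowth == -8) the two
      // BuildCFI calls above plus the one just below produce the classic frame-pointer
      // prologue sequence:
      //   push %rbp          -> .cfi_def_cfa_offset 16
      //                         .cfi_offset %rbp, -16
      //   mov  %rsp, %rbp    -> .cfi_def_cfa_register %rbp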
// Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( nullptr, DwarfFramePtr)); } if (NeedsWinFPO) { // .cv_fpo_setframe $FramePtr HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(0) .setMIFlag(MachineInstr::FrameSetup); } } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } // Update the offset adjustment, which is mainly used by codeview to translate // from ESP to VFRAME relative local variable offsets. if (!IsFunclet) { if (HasFP && TRI->needsStackRealignment(MF)) MFI.setOffsetAdjustment(-NumBytes); else MFI.setOffsetAdjustment(-StackSize); } // For EH funclets, only allocate enough space for outgoing calls. Save the // NumBytes value that we would've used for the parent frame. unsigned ParentFrameNumBytes = NumBytes; if (IsFunclet) NumBytes = getWinEHFuncletFrameSize(MF); // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; Register Reg = MBBI->getOperand(0).getReg(); ++MBBI; if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset)); StackOffset += stackGrowth; } if (NeedsWinCFI) { HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(Reg) .setMIFlag(MachineInstr::FrameSetup); } } // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); if (NeedsWinCFI) { HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign)) .addImm(MaxAlign) .setMIFlag(MachineInstr::FrameSetup); } } // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and // the callee has more arguments then the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, true); // Adjust stack pointer: ESP -= numbytes. // Windows and cygwin/mingw require a prologue helper routine when allocating // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the // stack and adjust the stack pointer in one go. The 64-bit version of // __chkstk is only responsible for probing the stack. The 64-bit prologue is // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. 
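// Editor's note: a standalone sketch (hypothetical enum and function; the code further
// below picks between X86::MOV32ri, X86::MOV64ri32 and X86::MOV64ri) of how the probe
// amount is staged into (R)AX with the smallest immediate encoding that still fits.
#include <cstdint>

enum class AllocMov { Mov32ri, Mov64ri32, Mov64ri };

static AllocMov pickAllocMovSketch(int64_t Alloc) {
  if (Alloc >= 0 && Alloc <= INT64_C(0xFFFFFFFF))
    return AllocMov::Mov32ri;   // isUInt<32>: zero-extended 32-bit immediate
  if (Alloc >= INT32_MIN && Alloc <= INT32_MAX)
    return AllocMov::Mov64ri32; // isInt<32>: sign-extended 32-bit immediate
  return AllocMov::Mov64ri;     // full 64-bit immediate as a last resort
}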
uint64_t AlignedNumBytes = NumBytes; if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) { assert(!X86FI->getUsesRedZone() && "The Red Zone is not accounted for in stack probes"); // Check whether EAX is livein for this block. bool isEAXAlive = isEAXLiveIn(MBB); if (isEAXAlive) { if (Is64Bit) { // Save RAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) .addReg(X86::RAX, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } else { // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) .addReg(X86::EAX, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } } if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes; if (isUInt<32>(Alloc)) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(Alloc) .setMIFlag(MachineInstr::FrameSetup); } else if (isInt<32>(Alloc)) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) .addImm(Alloc) .setMIFlag(MachineInstr::FrameSetup); } else { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) .addImm(Alloc) .setMIFlag(MachineInstr::FrameSetup); } } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) .setMIFlag(MachineInstr::FrameSetup); } // Call __chkstk, __chkstk_ms, or __alloca. emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { // Restore RAX/EAX MachineInstr *MI; if (Is64Bit) MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX), StackPtr, false, NumBytes - 8); else MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } } else if (NumBytes) { emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false); } if (NeedsWinCFI && NumBytes) { HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } int SEHFrameOffset = 0; unsigned SPOrEstablisher; if (IsFunclet) { if (IsClrFunclet) { // The establisher parameter passed to a CLR funclet is actually a pointer // to the (mostly empty) frame of its nearest enclosing funclet; we have // to find the root function establisher frame by loading the PSPSym from // the intermediate frame. unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); MachinePointerInfo NoInfo; MBB.addLiveIn(Establisher); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), Establisher, false, PSPSlotOffset) .addMemOperand(MF.getMachineMemOperand( NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize))); ; // Save the root establisher back into the current funclet's (mostly // empty) frame, in case a sub-funclet or the GC needs it. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, PSPSlotOffset) .addReg(Establisher) .addMemOperand(MF.getMachineMemOperand( NoInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, SlotSize, Align(SlotSize))); } SPOrEstablisher = Establisher; } else { SPOrEstablisher = StackPtr; } if (IsWin64Prologue && HasFP) { // Set RBP to a small fixed offset from RSP. 
In the funclet case, we base // this calculation on the incoming establisher, which holds the value of // RSP from the parent frame at the end of the prologue. SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), SPOrEstablisher, false, SEHFrameOffset); else BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) .addReg(SPOrEstablisher); // If this is not a funclet, emit the CFI describing our frame pointer. if (NeedsWinCFI && !IsFunclet) { assert(!NeedsWinFPO && "this setframe incompatible with FPO data"); HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); if (isAsynchronousEHPersonality(Personality)) MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; } } else if (IsFunclet && STI.is32Bit()) { // Reset EBP / ESI to something good for funclets. MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); // If we're a catch funclet, we can be returned to via catchret. Save ESP // into the registration node so that the runtime will restore it for us. if (!MBB.isCleanupFuncletEntry()) { assert(Personality == EHPersonality::MSVC_CXX); Register FrameReg; int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed(); // ESP is the first field, so no extra displacement is needed. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, false, EHRegOffset) .addReg(X86::ESP); } } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { const MachineInstr &FrameInstr = *MBBI; ++MBBI; if (NeedsWinCFI) { int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { int Offset; Register IgnoredFrameReg; if (IsWin64Prologue && IsFunclet) Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg); else Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() + SEHFrameOffset; HasWinCFI = true; assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) .addImm(Reg) .addImm(Offset) .setMIFlag(MachineInstr::FrameSetup); } } } } if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); if (FnHasClrFunclet && !IsFunclet) { // Save the so-called Initial-SP (i.e. the value of the stack pointer // immediately after the prolog) into the PSPSlot so that funclets // and the GC can recover it. unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); auto PSPInfo = MachinePointerInfo::getFixedStack( MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, PSPSlotOffset) .addReg(StackPtr) .addMemOperand(MF.getMachineMemOperand( PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, SlotSize, Align(SlotSize))); } // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); } // We already dealt with stack realignment and funclets above. if (IsFunclet && STI.is32Bit()) return; // If we need a base pointer, set it up here. 
It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (TRI->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens // dependence chain. Used by SjLj EH. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); } if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { // Stash the value of the frame pointer relative to the base pointer for // Win32 EH. This supports Win32 EH, which does the inverse of the above: // it recovers the frame pointer from the base pointer rather than the // other way around. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; Register UsedReg; int Offset = getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg) .getFixed(); assert(UsedReg == BasePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI( MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. emitCalleeSavedFrameMoves(MBB, MBBI, DL, true); } // X86 Interrupt handling function cannot assume anything about the direction // flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction // in each prologue of interrupt handler function. // // FIXME: Create "cld" instruction only in these cases: // 1. The interrupt handling function uses any of the "rep" instructions. // 2. Interrupt handling function calls another function. // if (Fn.getCallingConv() == CallingConv::X86_INTR) BuildMI(MBB, MBBI, DL, TII.get(X86::CLD)) .setMIFlag(MachineInstr::FrameSetup); // At this point we know if the function has WinCFI or not. MF.setHasWinCFI(HasWinCFI); } bool X86FrameLowering::canUseLEAForSPInEpilogue( const MachineFunction &MF) const { // We can't use LEA instructions for adjusting the stack pointer if we don't // have a frame pointer in the Win64 ABI. Only ADD instructions may be used // to deallocate the stack. // This means that we can use LEA for SP in two situations: // 1. We *aren't* using the Win64 ABI which means we are free to use LEA. // 2. We *have* a frame pointer which means we are permitted to use LEA. return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } static bool isFuncletReturnInstr(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CATCHRET: case X86::CLEANUPRET: return true; default: return false; } llvm_unreachable("impossible"); } // CLR funclets use a special "Previous Stack Pointer Symbol" slot on the // stack. It holds a pointer to the bottom of the root function frame. 
The // establisher frame pointer passed to a nested funclet may point to the // (mostly empty) frame of its parent funclet, but it will need to find // the frame of the root function to access locals. To facilitate this, // every funclet copies the pointer to the bottom of the root function // frame into a PSPSym slot in its own (mostly empty) stack frame. Using the // same offset for the PSPSym in the root function frame that's used in the // funclets' frames allows each funclet to dynamically accept any ancestor // frame as its establisher argument (the runtime doesn't guarantee the // immediate parent for some reason lost to history), and also allows the GC, // which uses the PSPSym for some bookkeeping, to find it in any funclet's // frame with only a single offset reported for the entire method. unsigned X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); Register SPReg; int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg, /*IgnoreSPUpdates*/ true) .getFixed(); assert(Offset >= 0 && SPReg == TRI->getStackRegister()); return static_cast(Offset); } unsigned X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { const X86MachineFunctionInfo *X86FI = MF.getInfo(); // This is the size of the pushed CSRs. unsigned CSSize = X86FI->getCalleeSavedFrameSize(); // This is the size of callee saved XMMs. const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); unsigned XMMSize = WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass); // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = classifyEHPersonality(MF.getFunction().getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { // CLR funclets need to hold enough space to include the PSPSym, at the // same offset from the stack pointer (immediately after the prolog) as it // resides at in the main function. UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; } else { // Other funclets just need enough stack for outgoing call arguments. UsedSize = MF.getFrameInfo().getMaxCallFrameSize(); } // RBP is not included in the callee saved register block. After pushing RBP, // everything is 16 byte aligned. Everything we allocate before an outgoing // call must also be 16 byte aligned. unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. return FrameSizeMinusRBP + XMMSize - CSSize; } static bool isTailCallOpcode(unsigned Opc) { return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64; } void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator(); MachineBasicBlock::iterator MBBI = Terminator; DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); Register FramePtr = TRI->getFrameRegister(MF); Register MachineFramePtr = Is64BitILP32 ? 
Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = IsWin64Prologue && MF.getFunction().needsUnwindTableEntry(); bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI); // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI.getStackSize(); uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); bool HasFP = hasFP(MF); uint64_t NumBytes = 0; bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() && !MF.getTarget().getTargetTriple().isOSWindows()) && MF.needsFrameMoves(); if (IsFunclet) { assert(HasFP && "EH funclets without FP not yet implemented"); NumBytes = getWinEHFuncletFrameSize(MF); } else if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; // Callee-saved registers were pushed on stack before the stack was // realigned. if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = alignTo(FrameSize, MaxAlign); } else { NumBytes = StackSize - CSSize; } uint64_t SEHStackAllocAmt = NumBytes; // AfterPop is the position to insert .cfi_restore. MachineBasicBlock::iterator AfterPop = MBBI; if (HasFP) { // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); if (NeedsDwarfCFI) { unsigned DwarfStackPtr = TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize)); if (!MBB.succ_empty() && !MBB.isReturnBlock()) { unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); BuildCFI(MBB, AfterPop, DL, MCCFIInstruction::createRestore(nullptr, DwarfFramePtr)); --MBBI; --AfterPop; } --MBBI; } } MachineBasicBlock::iterator FirstCSPop = MBBI; // Skip the callee-saved pop instructions. while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); if (Opc != X86::DBG_VALUE && !PI->isTerminator()) { if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy))) break; FirstCSPop = PI; } --MBBI; } MBBI = FirstCSPop; if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET) emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator); if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI.hasVarSizedObjects()) NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. Don't do this if this was a funclet epilogue, since the funclets // will not do realignment or dynamic stack allocation. if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) && !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); uint64_t LEAAmount = IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; // There are only two legal forms of epilogue: // - add SEHAllocationSize, %rsp // - lea SEHAllocationSize(%FramePtr), %rsp // // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. 
// However, we may use this sequence if we have a frame pointer because the // effects of the prologue can safely be undone. if (LEAAmount != 0) { unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false, LEAAmount); --MBBI; } else { unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); --MBBI; } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true); if (!hasFP(MF) && NeedsDwarfCFI) { // Define the current CFA rule to use the provided offset. BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, CSSize + SlotSize)); } --MBBI; } // Windows unwinder will not invoke function's exception handler if IP is // either in prologue or in epilogue. This behavior causes a problem when a // call immediately precedes an epilogue, because the return address points // into the epilogue. To cope with that, we insert an epilogue marker here, // then replace it with a 'nop' if it ends up immediately after a CALL in the // final emitted code. if (NeedsWin64CFI && MF.hasWinCFI()) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); if (!hasFP(MF) && NeedsDwarfCFI) { MBBI = FirstCSPop; int64_t Offset = -CSSize - SlotSize; // Mark callee-saved pop instruction. // Define the current CFA rule to use the provided offset. while (MBBI != MBB.end()) { MachineBasicBlock::iterator PI = MBBI; unsigned Opc = PI->getOpcode(); ++MBBI; if (Opc == X86::POP32r || Opc == X86::POP64r) { Offset += SlotSize; BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset)); } } } // Emit DWARF info specifying the restores of the callee-saved registers. // For epilogue with return inside or being other block without successor, // no need to generate .cfi_restore for callee-saved registers. if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) { emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false); } if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. int Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, Terminator, true); emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } // Emit tilerelease for AMX kernel. const MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); - unsigned TileRegNum = RC->getNumRegs(); - for (unsigned I = 0; I < TileRegNum; I++) { - if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) { - BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); - break; - } - } + if (!MRI.reg_nodbg_empty(X86::TMMCFG)) + BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); } StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsFixed = MFI.isFixedObjectIndex(FI); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. if (TRI->hasBasePointer(MF)) FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister(); else if (TRI->needsStackRealignment(MF)) FrameReg = IsFixed ? 
TRI->getFramePtr() : TRI->getStackRegister(); else FrameReg = TRI->getFrameRegister(MF); // Offset will hold the offset from the stack pointer at function entry to the // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI.getStackSize(); bool HasFP = hasFP(MF); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int64_t FPDelta = 0; // In an x86 interrupt, remove the offset we added to account for the return // address from any stack object allocated in the caller's frame. Interrupts // do not have a standard return address. Fixed objects in the current frame, // such as SSE register spills, should not get this treatment. if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR && Offset >= 0) { Offset += getOffsetOfLocalArea(); } if (IsWin64Prologue) { assert(!MFI.hasCalls() || (StackSize % 16) == 8); // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; uint64_t NumBytes = FrameSize - CSSize; uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); if (FI && FI == X86FI->getFAIndex()) return StackOffset::getFixed(-SEHFrameOffset); // FPDelta is the offset from the "traditional" FP location of the old base // pointer followed by return address and the location required by the // restricted Win64 prologue. // Add FPDelta to all offsets below that go through the frame pointer. FPDelta = FrameSize - SEHFrameOffset; assert((!MFI.hasCalls() || (FPDelta % 16) == 0) && "FPDelta isn't aligned per the Win64 ABI!"); } if (TRI->hasBasePointer(MF)) { assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. return StackOffset::getFixed(Offset + SlotSize + FPDelta); } else { assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); return StackOffset::getFixed(Offset + StackSize); } } else if (TRI->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. return StackOffset::getFixed(Offset + SlotSize + FPDelta); } else { assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); return StackOffset::getFixed(Offset + StackSize); } // FIXME: Support tail calls } else { if (!HasFP) return StackOffset::getFixed(Offset + StackSize); // Skip the saved EBP. 
Offset += SlotSize; // Skip the RETADDR move area int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; } return StackOffset::getFixed(Offset + FPDelta); } int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const X86MachineFunctionInfo *X86FI = MF.getInfo(); const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); const auto it = WinEHXMMSlotInfo.find(FI); if (it == WinEHXMMSlotInfo.end()) return getFrameIndexReference(MF, FI, FrameReg).getFixed(); FrameReg = TRI->getStackRegister(); return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) + it->second; } StackOffset X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI, Register &FrameReg, int Adjustment) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); FrameReg = TRI->getStackRegister(); return StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment); } StackOffset X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, Register &FrameReg, bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI.getStackSize(); // LLVM arranges the stack as follows: // ... // ARG2 // ARG1 // RETADDR // PUSH RBP <-- RBP points here // PUSH CSRs // ~~~~~~~ <-- possible stack realignment (non-win64) // ... // STACK OBJECTS // ... <-- RSP after prologue points here // ~~~~~~~ <-- possible stack realignment (win64) // // if (hasVarSizedObjects()): // ... <-- "base pointer" (ESI/RBX) points here // DYNAMIC ALLOCAS // ... <-- RSP points here // // Case 1: In the simple case of no stack realignment and no dynamic // allocas, both "fixed" stack objects (arguments and CSRs) are addressable // with fixed offsets from RSP. // // Case 2: In the case of stack realignment with no dynamic allocas, fixed // stack objects are addressed with RBP and regular stack objects with RSP. // // Case 3: In the case of dynamic allocas and stack realignment, RSP is used // to address stack arguments for outgoing calls and nothing else. The "base // pointer" points to local variables, and RBP points to fixed objects. // // In cases 2 and 3, we can only answer for non-fixed stack objects, and the // answer we give is relative to the SP after the prologue, and not the // SP in the middle of the function. if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) && !STI.isTargetWin64()) return getFrameIndexReference(MF, FI, FrameReg); // If !hasReservedCallFrame the function might have SP adjustement in the // body. So, even though the offset is statically known, it depends on where // we are in the function. if (!IgnoreSPUpdates && !hasReservedCallFrame(MF)) return getFrameIndexReference(MF, FI, FrameReg); // We don't handle tail calls, and shouldn't be seeing them either. assert(MF.getInfo()->getTCReturnAddrDelta() >= 0 && "we don't handle this case!"); // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is // one word (eight bytes). Obj0 is the stack slot we're trying to // get to. // // ---------------------------------- // | BP | Obj0 | Obj1 | ... | ObjN | // ---------------------------------- // ^ ^ ^ ^ // A B C E // // A is the incoming stack pointer. 
// (B - A) is the local area offset (-8 for x86-64) [1] // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2] // // |(E - B)| is the StackSize (absolute value, positive). For a // stack that grown down, this works out to be (B - E). [3] // // E is also the value of %rsp after stack has been set up, and we // want (C - E) -- the value we can add to %rsp to get to Obj0. Now // (C - E) == (C - A) - (B - A) + (B - E) // { Using [1], [2] and [3] above } // == getObjectOffset - LocalAreaOffset + StackSize return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize); } bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const { MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CalleeSavedFrameSize = 0; unsigned XMMCalleeSavedFrameSize = 0; auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) { // create RETURNADDR area // arg // arg // RETADDR // { ... // RETADDR area // ... // } // [EBP] MFI.CreateFixedObject(-TailCallReturnAddrDelta, TailCallReturnAddrDelta - SlotSize, true); } // Spill the BasePtr if it's used. if (this->TRI->hasBasePointer(MF)) { // Allocate a spill slot for EBP if we have a base pointer and EH funclets. if (MF.hasEHFunclets()) { int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize)); X86FI->setHasSEHFramePtrSave(true); X86FI->setSEHFramePtrSaveIndex(FI); } } if (hasFP(MF)) { // emitPrologue always spills frame register the first thing. SpillSlotOffset -= SlotSize; MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. Register FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); break; } } } // Assign slots for GPRs. It increases frame size. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; SpillSlotOffset -= SlotSize; CalleeSavedFrameSize += SlotSize; int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); } X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize); // Assign slots for XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // If this is k-register make sure we lookup via the largest legal type. MVT VT = MVT::Other; if (X86::VK16RegClass.contains(Reg)) VT = STI.hasBWI() ? 
MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); unsigned Size = TRI->getSpillSize(*RC); Align Alignment = TRI->getSpillAlign(*RC); // ensure alignment assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86"); SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment); // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Alignment); // Save the start offset and size of XMM in stack frame for funclets. if (X86::VR128RegClass.contains(Reg)) { WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize; XMMCalleeSavedFrameSize += Size; } } return true; } bool X86FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI // for us, and there are no XMM CSRs on Win32. if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) return true; // Push GPRs. It increases frame size. const MachineFunction &MF = *MBB.getParent(); unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; const MachineRegisterInfo &MRI = MF.getRegInfo(); bool isLiveIn = MRI.isLiveIn(Reg); if (!isLiveIn) MBB.addLiveIn(Reg); // Decide whether we can add a kill flag to the use. bool CanKill = !isLiveIn; // Check if any subregister is live-in if (CanKill) { for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) { if (MRI.isLiveIn(*AReg)) { CanKill = false; break; } } } // Do not set a kill flag on values that are also marked as live-in. This // happens with the @llvm-returnaddress intrinsic and with arguments // passed in callee saved registers. // Omitting the kill flags is conservatively correct even if the live-in // is not used after all. BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill)) .setMIFlag(MachineInstr::FrameSetup); } // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // If this is k-register make sure we lookup via the largest legal type. MVT VT = MVT::Other; if (X86::VK16RegClass.contains(Reg)) VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, TRI); --MI; MI->setFlag(MachineInstr::FrameSetup); ++MI; } return true; } void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineInstr *CatchRet) const { // SEH shouldn't use catchret. assert(!isAsynchronousEHPersonality(classifyEHPersonality( MBB.getParent()->getFunction().getPersonalityFn())) && "SEH should not use CATCHRET"); DebugLoc DL = CatchRet->getDebugLoc(); MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB(); // Fill EAX/RAX with the address of the target block. 
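  // Editor's note: in X86 MIR a memory reference is always the five operands
  // (base, scale, index, displacement, segment). The 64-bit branch below therefore
  // encodes "lea CatchRetTarget(%rip), %rax" with RIP as the base, no index, the
  // target block as the displacement and no segment, materializing the block address
  // RIP-relatively; the 32-bit branch simply uses MOV32ri with the absolute address.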
if (STI.is64Bit()) { // LEA64r CatchRetTarget(%rip), %rax BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(CatchRetTarget) .addReg(0); } else { // MOV32ri $CatchRetTarget, %eax BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addMBB(CatchRetTarget); } // Record that we've taken the address of CatchRetTarget and no longer just // reference it in a terminator. CatchRetTarget->setHasAddressTaken(); } bool X86FrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) { // Don't restore CSRs in 32-bit EH funclets. Matches // spillCalleeSavedRegisters. if (STI.is32Bit()) return true; // Don't restore CSRs before an SEH catchret. SEH except blocks do not form // funclets. emitEpilogue transforms these to normal jumps. if (MI->getOpcode() == X86::CATCHRET) { const Function &F = MBB.getParent()->getFunction(); bool IsSEH = isAsynchronousEHPersonality( classifyEHPersonality(F.getPersonalityFn())); if (IsSEH) return true; } } DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // If this is k-register make sure we lookup via the largest legal type. MVT VT = MVT::Other; if (X86::VK16RegClass.contains(Reg)) VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; BuildMI(MBB, MI, DL, TII.get(Opc), Reg) .setMIFlag(MachineInstr::FrameDestroy); } return true; } void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); // Spill the BasePtr if it's used. if (TRI->hasBasePointer(MF)){ Register BasePtr = TRI->getBaseRegister(); if (STI.isTarget64BitILP32()) BasePtr = getX86SubSuperRegister(BasePtr, 64); SavedRegs.set(BasePtr); } } static bool HasNestArgument(const MachineFunction *MF) { const Function &F = MF->getFunction(); for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; I++) { if (I->hasNestAttr() && !I->use_empty()) return true; } return false; } /// GetScratchRegister - Get a temp register for performing work in the /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform /// and the properties of the function either one or two registers will be /// needed. Set primary to true for the first register, false for the second. static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction().getCallingConv(); // Erlang stuff. if (CallingConvention == CallingConv::HiPE) { if (Is64Bit) return Primary ? X86::R14 : X86::R13; else return Primary ? X86::EBX : X86::EDI; } if (Is64Bit) { if (IsLP64) return Primary ? X86::R11 : X86::R12; else return Primary ? 
X86::R11D : X86::R12D; } bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || CallingConvention == CallingConv::Fast || CallingConvention == CallingConv::Tail) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); return Primary ? X86::EAX : X86::ECX; } if (IsNested) return Primary ? X86::EDX : X86::EAX; return Primary ? X86::ECX : X86::EAX; } // The stack limit in the TCB is set to this many bytes above the actual stack // limit. static const uint64_t kSplitStackAvailable = 256; void X86FrameLowering::adjustForSegmentedStacks( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t StackSize; unsigned TlsReg, TlsOffset; DebugLoc DL; // To support shrink-wrapping we would need to insert the new blocks // at the right place and update the branches to PrologueMBB. assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); if (MF.getFunction().isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD() && !STI.isTargetDragonFly()) report_fatal_error("Segmented stacks not supported on this platform."); // Eventually StackSize will be calculated by a link-time pass; which will // also decide whether checking code needs to be injected into this particular // prologue. StackSize = MFI.getStackSize(); // Do not generate a prologue for leaf functions with a stack of size zero. // For non-leaf functions we have to allow for the possibility that the // callis to a non-split function, as in PR37807. This function could also // take the address of a non-split function. When the linker tries to adjust // its non-existent prologue, it would fail with an error. Mark the object // file so that such failures are not errors. See this Go language bug-report // https://go-review.googlesource.com/c/go/+/148819/ if (StackSize == 0 && !MFI.hasTailCall()) { MF.getMMI().setHasNosplitStack(true); return; } MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); X86MachineFunctionInfo *X86FI = MF.getInfo(); bool IsNested = false; // We need to know if the function has a nest argument only in 64 bit mode. if (Is64Bit) IsNested = HasNestArgument(&MF); // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. for (const auto &LI : PrologueMBB.liveins()) { allocMBB->addLiveIn(LI); checkMBB->addLiveIn(LI); } if (IsNested) allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); MF.push_front(allocMBB); MF.push_front(checkMBB); // When the frame size is less than 256 we just compare the stack // boundary directly to the value of the stack pointer, per gcc. bool CompareStackPointer = StackSize < kSplitStackAvailable; // Read the limit off the current stacklet off the stack_guard location. if (Is64Bit) { if (STI.isTargetLinux()) { TlsReg = X86::FS; TlsOffset = IsLP64 ? 0x70 : 0x40; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. 
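// [Illustrative sketch, not part of the patch.] The segmented-stack check built in
// checkMBB reduces to the comparison below: frames smaller than kSplitStackAvailable
// compare %rsp/%esp itself against the stacklet limit read from the TLS slot picked
// for the target OS, while larger frames compare SP minus the frame size. The helper
// name and the plain-integer signature are hypothetical.
#include <cstdint>

static bool stackletHasRoom(uint64_t SP, uint64_t StackSize, uint64_t TlsStackLimit,
                            uint64_t SplitStackAvailable = 256) {
  // Small frames rely on the headroom the runtime keeps above the recorded limit.
  uint64_t Probe = StackSize < SplitStackAvailable ? SP : SP - StackSize;
  return Probe > TlsStackLimit; // true: jump to the function body; false: __morestack
}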
} else if (STI.isTargetWin64()) { TlsReg = X86::GS; TlsOffset = 0x28; // pvArbitrary, reserved for application use } else if (STI.isTargetFreeBSD()) { TlsReg = X86::FS; TlsOffset = 0x18; } else if (STI.isTargetDragonFly()) { TlsReg = X86::FS; TlsOffset = 0x20; // use tls_tcb.tcb_segstack } else { report_fatal_error("Segmented stacks not supported on this platform."); } if (CompareStackPointer) ScratchReg = IsLP64 ? X86::RSP : X86::ESP; else BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { if (STI.isTargetLinux()) { TlsReg = X86::GS; TlsOffset = 0x30; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x48 + 90*4; } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use } else if (STI.isTargetDragonFly()) { TlsReg = X86::FS; TlsOffset = 0x10; // use tls_tcb.tcb_segstack } else if (STI.isTargetFreeBSD()) { report_fatal_error("Segmented stacks not supported on FreeBSD i386."); } else { report_fatal_error("Segmented stacks not supported on this platform."); } if (CompareStackPointer) ScratchReg = X86::ESP; else BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || STI.isTargetDragonFly()) { BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. unsigned ScratchReg2; bool SaveScratch2; if (CompareStackPointer) { // The primary scratch register is available for holding the TLS offset. ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); // Unfortunately, with fastcc the second scratch register may hold an // argument. SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); } // If Scratch2 is live-in then it needs to be saved. assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && "Scratch register is live-in and not saved"); if (SaveScratch2) BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) .addReg(ScratchReg2, RegState::Kill); BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) .addImm(TlsOffset); BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) .addReg(ScratchReg) .addReg(ScratchReg2).addImm(1).addReg(0) .addImm(0) .addReg(TlsReg); if (SaveScratch2) BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); } } // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. if (Is64Bit) { // Functions with nested arguments use R10, so it needs to be saved across // the call to _morestack const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; const unsigned MOVrr = IsLP64 ? 
X86::MOV64rr : X86::MOV32rr; const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; if (IsNested) BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(StackSize); } // __morestack is in libgcc if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { // Under the large code model, we cannot assume that __morestack lives // within 2^31 bytes of the call site, so we cannot use pc-relative // addressing. We cannot perform the call via a temporary register, // as the rax register may be used to store the static chain, and all // other suitable registers may be either callee-save or used for // parameter passing. We cannot use the stack at this point either // because __morestack manipulates the stack directly. // // To avoid these issues, perform an indirect call via a read-only memory // location containing the address. // // This solution is not perfect, as it assumes that the .rodata section // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. // FIXME: Add retpoline support and remove the error here.. if (STI.useIndirectThunkCalls()) report_fatal_error("Emitting morestack calls on 64-bit with the large " "code model and thunks not yet implemented."); BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) .addReg(0) .addExternalSymbol("__morestack_addr") .addReg(0); MF.getMMI().setUsesMorestackAddr(true); } else { if (Is64Bit) BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack"); else BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__morestack"); } if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); else BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); allocMBB->addSuccessor(&PrologueMBB); checkMBB->addSuccessor(allocMBB, BranchProbability::getZero()); checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne()); #ifdef EXPENSIVE_CHECKS MF.verify(); #endif } /// Lookup an ERTS parameter in the !hipe.literals named metadata node. /// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets /// to fields it needs, through a named metadata node "hipe.literals" containing /// name-value pairs. static unsigned getHiPELiteral( NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) { for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) { MDNode *Node = HiPELiteralsMD->getOperand(i); if (Node->getNumOperands() != 2) continue; MDString *NodeName = dyn_cast(Node->getOperand(0)); ValueAsMetadata *NodeVal = dyn_cast(Node->getOperand(1)); if (!NodeName || !NodeVal) continue; ConstantInt *ValConst = dyn_cast_or_null(NodeVal->getValue()); if (ValConst && NodeName->getString() == LiteralName) { return ValConst->getZExtValue(); } } report_fatal_error("HiPE literal " + LiteralName + " required but not provided"); } // Return true if there are no non-ehpad successors to MBB and there are no // non-meta instructions between MBBI and MBB.end(). 
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator MBBI) { return llvm::all_of( MBB.successors(), [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) && std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) { return MI.isMetaInstruction(); }); } /// Erlang programs may need a special prologue to handle the stack size they /// might need at runtime. That is because Erlang/OTP does not implement a C /// stack but uses a custom implementation of hybrid stack/heap architecture. /// (for more information see Eric Stenman's Ph.D. thesis: /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) /// /// CheckStack: /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart /// OldStart: /// ... /// IncStack: /// call inc_stack # doubles the stack space /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo &MFI = MF.getFrameInfo(); DebugLoc DL; // To support shrink-wrapping we would need to insert the new blocks // at the right place and update the branches to PrologueMBB. assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); // HiPE-specific values NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule() ->getNamedMetadata("hipe.literals"); if (!HiPELiteralsMD) report_fatal_error( "Can't generate HiPE prologue without runtime parameters"); const unsigned HipeLeafWords = getHiPELiteral(HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS"); const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; const unsigned Guaranteed = HipeLeafWords * SlotSize; unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ? MF.getFunction().arg_size() - CCRegisteredArgs : 0; unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize; assert(STI.isTargetLinux() && "HiPE prologue is only supported on Linux operating systems."); // Compute the largest caller's frame that is needed to fit the callees' // frames. This 'MaxStack' is computed from: // // a) the fixed frame size, which is the space needed for all spilled temps, // b) outgoing on-stack parameter areas, and // c) the minimum stack space this function needs to make available for the // functions it calls (a tunable ABI property). if (MFI.hasCalls()) { unsigned MoreStackForCalls = 0; for (auto &MBB : MF) { for (auto &MI : MBB) { if (!MI.isCall()) continue; // Get callee operand. const MachineOperand &MO = MI.getOperand(0); // Only take account of global function calls (no closures etc.). if (!MO.isGlobal()) continue; const Function *F = dyn_cast(MO.getGlobal()); if (!F) continue; // Do not update 'MaxStack' for primitive and built-in functions // (encoded with names either starting with "erlang."/"bif_" or not // having a ".", such as a simple .., or an // "_", such as the BIF "suspend_0") as they are executed on another // stack. if (F->getName().find("erlang.") != StringRef::npos || F->getName().find("bif_") != StringRef::npos || F->getName().find_first_of("._") == StringRef::npos) continue; unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs ? 
F->arg_size()-CCRegisteredArgs : 0; if (HipeLeafWords - 1 > CalleeStkArity) MoreStackForCalls = std::max(MoreStackForCalls, (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); } } MaxStack += MoreStackForCalls; } // If the stack frame needed is larger than the guaranteed then runtime checks // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. if (MaxStack > Guaranteed) { MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); for (const auto &LI : PrologueMBB.liveins()) { stackCheckMBB->addLiveIn(LI); incStackMBB->addLiveIn(LI); } MF.push_front(incStackMBB); MF.push_front(stackCheckMBB); unsigned ScratchReg, SPReg, PReg, SPLimitOffset; unsigned LEAop, CMPop, CALLop; SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT"); if (Is64Bit) { SPReg = X86::RSP; PReg = X86::RBP; LEAop = X86::LEA64r; CMPop = X86::CMP64rm; CALLop = X86::CALL64pcrel32; } else { SPReg = X86::ESP; PReg = X86::EBP; LEAop = X86::LEA32r; CMPop = X86::CMP32rm; CALLop = X86::CALLpcrel32; } ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "HiPE prologue scratch register is live-in"); // Create new MBB for StackCheck: addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg, false, -MaxStack); // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). addExternalSymbol("inc_stack_0"); addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE); stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); incStackMBB->addSuccessor(incStackMBB, {1, 100}); } #ifdef EXPENSIVE_CHECKS MF.verify(); #endif } bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int Offset) const { if (Offset <= 0) return false; if (Offset % SlotSize) return false; int NumPops = Offset / SlotSize; // This is only worth it if we have at most 2 pops. if (NumPops != 1 && NumPops != 2) return false; // Handle only the trivial case where the adjustment directly follows // a call. This is the most common one, anyway. if (MBBI == MBB.begin()) return false; MachineBasicBlock::iterator Prev = std::prev(MBBI); if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) return false; unsigned Regs[2]; unsigned FoundRegs = 0; const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const MachineOperand &RegMask = Prev->getOperand(1); auto &RegClass = Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; // Try to find up to NumPops free registers. for (auto Candidate : RegClass) { // Poor man's liveness: // Since we're immediately after a call, any register that is clobbered // by the call and not defined by it can be considered dead. 
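// [Illustrative sketch, not part of the patch.] adjustStackWithPops only replaces the
// post-call "add esp/rsp, N" when N is a positive multiple of the slot size that one
// or two pops can cover; a plain pop encodes in a single byte, so at minsize this
// trims a few bytes per call site. Hypothetical helper mirroring that gate:
#include <cstdint>

static int popsForStackAdjustment(int64_t Offset, int64_t SlotSize) {
  if (Offset <= 0 || Offset % SlotSize != 0)
    return 0;                                     // not expressible as pops
  int64_t NumPops = Offset / SlotSize;
  return (NumPops == 1 || NumPops == 2) ? static_cast<int>(NumPops) : 0;
}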
if (!RegMask.clobbersPhysReg(Candidate)) continue; // Don't clobber reserved registers if (MRI.isReserved(Candidate)) continue; bool IsDef = false; for (const MachineOperand &MO : Prev->implicit_operands()) { if (MO.isReg() && MO.isDef() && TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) { IsDef = true; break; } } if (IsDef) continue; Regs[FoundRegs++] = Candidate; if (FoundRegs == (unsigned)NumPops) break; } if (FoundRegs == 0) return false; // If we found only one free register, but need two, reuse the same one twice. while (FoundRegs < (unsigned)NumPops) Regs[FoundRegs++] = Regs[0]; for (int i = 0; i < NumPops; ++i) BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); return true; } MachineBasicBlock::iterator X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { bool reserveCallFrame = hasReservedCallFrame(MF); unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = TII.getFrameSize(*I); uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0; I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); // Try to avoid emitting dead SP adjustments if the block end is unreachable, // typically because the function is marked noreturn (abort, throw, // assert_fail, etc). if (isDestroy && blockEndIsUnreachable(MBB, I)) return I; if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, ' and the // adjcallstackdown instruction into 'add ESP, ' // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. Amount = alignTo(Amount, getStackAlign()); const Function &F = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves(); // If we have any exception handlers in this function, and we adjust // the SP before calls, we may need to indicate this to the unwinder // using GNU_ARGS_SIZE. Note that this may be necessary even when // Amount == 0, because the preceding function may have set a non-0 // GNU_ARGS_SIZE. // TODO: We don't need to reset this between subsequent functions, // if it didn't change. bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty(); if (HasDwarfEHHandlers && !isDestroy && MF.getInfo()->getHasPushSequences()) BuildCFI(MBB, InsertPos, DL, MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); if (Amount == 0) return I; // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; // TODO: This is needed only if we require precise CFA. // If this is a callee-pop calling convention, emit a CFA adjust for // the amount the callee popped. if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) BuildCFI(MBB, InsertPos, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); // Add Amount to SP to destroy a frame, or subtract to setup. int64_t StackAdjustment = isDestroy ? Amount : -Amount; if (StackAdjustment) { // Merge with any previous or following adjustment instruction. Note: the // instructions merged with here do not have CFI, so their stack // adjustments do not feed into CfaAdjustment. 
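// [Illustrative sketch, not part of the patch.] The alignTo(Amount, getStackAlign())
// rounding above keeps the outgoing-argument area a multiple of the stack alignment;
// for a power-of-two alignment it is the usual bit trick:
#include <cstdint>

static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1); // e.g. alignToPow2(20, 16) == 32
}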
StackAdjustment += mergeSPUpdates(MBB, InsertPos, true); StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); if (StackAdjustment) { if (!(F.hasMinSize() && adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment))) BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment, /*InEpilogue=*/false); } } if (DwarfCFI && !hasFP(MF)) { // If we don't have FP, but need to generate unwind information, // we need to set the correct CFA offset after the stack adjustment. // How much we adjust the CFA offset depends on whether we're emitting // CFI only for EH purposes or for debugging. EH only requires the CFA // offset to be correct at each call site, while for debugging we want // it to be more precise. int64_t CfaAdjustment = -StackAdjustment; // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. if (CfaAdjustment) { BuildCFI(MBB, InsertPos, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment)); } } return I; } if (InternalAmt) { MachineBasicBlock::iterator CI = I; MachineBasicBlock::iterator B = MBB.begin(); while (CI != B && !std::prev(CI)->isCall()) --CI; BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false); } return I; } bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); const MachineFunction &MF = *MBB.getParent(); return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS); } bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); // Win64 has strict requirements in terms of epilogue and we are // not taking a chance at messing with them. // I.e., unless this block is already an exit block, we can't use // it as an epilogue. if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) return false; if (canUseLEAForSPInEpilogue(*MBB.getParent())) return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which // clobbers the EFLAGS. Check that we do not need to preserve it, // otherwise, conservatively assume this is not // safe to insert the epilogue here. return !flagsNeedToBePreservedBeforeTheTerminators(MBB); } bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // If we may need to emit frameless compact unwind information, give // up as this is currently broken: PR25614. bool CompactUnwind = MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() != nullptr; return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) || !CompactUnwind) && // The lowering of segmented stack and HiPE only support entry // blocks as prologue blocks: PR26107. 
This limitation may be // lifted if we fix: // - adjustForSegmentedStacks // - adjustForHiPEPrologue MF.getFunction().getCallingConv() != CallingConv::HiPE && !MF.shouldSplitStack(); } MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool RestoreSP) const { assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); assert(STI.is32Bit() && !Uses64BitFramePtr && "restoring EBP/ESI on non-32-bit target"); MachineFunction &MF = *MBB.getParent(); Register FramePtr = TRI->getFrameRegister(MF); Register BasePtr = TRI->getBaseRegister(); WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); // FIXME: Don't set FrameSetup flag in catchret case. int FI = FuncInfo.EHRegNodeFrameIndex; int EHRegSize = MFI.getObjectSize(FI); if (RestoreSP) { // MOV32rm -EHRegSize(%ebp), %esp addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), X86::EBP, true, -EHRegSize) .setMIFlag(MachineInstr::FrameSetup); } Register UsedReg; int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed(); int EndOffset = -EHRegOffset - EHRegSize; FuncInfo.EHRegNodeEndOffset = EndOffset; if (UsedReg == FramePtr) { // ADD $offset, %ebp unsigned ADDri = getADDriOpcode(false, EndOffset); BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) .addReg(FramePtr) .addImm(EndOffset) .setMIFlag(MachineInstr::FrameSetup) ->getOperand(3) .setIsDead(); assert(EndOffset >= 0 && "end of registration object above normal EBP position!"); } else if (UsedReg == BasePtr) { // LEA offset(%ebp), %esi addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), FramePtr, false, EndOffset) .setMIFlag(MachineInstr::FrameSetup); // MOV32rm SavedEBPOffset(%esi), %ebp assert(X86FI->getHasSEHFramePtrSave()); int Offset = getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg) .getFixed(); assert(UsedReg == BasePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), UsedReg, true, Offset) .setMIFlag(MachineInstr::FrameSetup); } else { llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); } return MBBI; } int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { return TRI->getSlotSize(); } Register X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const { return TRI->getDwarfRegNum(StackPtr, true); } namespace { // Struct used by orderFrameObjects to help sort the stack objects. struct X86FrameSortingObject { bool IsValid = false; // true if we care about this Object. unsigned ObjectIndex = 0; // Index of Object into MFI list. unsigned ObjectSize = 0; // Size of Object in bytes. Align ObjectAlignment = Align(1); // Alignment of Object in bytes. unsigned ObjectNumUses = 0; // Object static number of uses. }; // The comparison function we use for std::sort to order our local // stack symbols. The current algorithm is to use an estimated // "density". This takes into consideration the size and number of // uses each object has in order to roughly minimize code size. // So, for example, an object of size 16B that is referenced 5 times // will get higher priority than 4 4B objects referenced 1 time each. 
// It's not perfect and we may be able to squeeze a few more bytes out of // it (for example : 0(esp) requires fewer bytes, symbols allocated at the // fringe end can have special consideration, given their size is less // important, etc.), but the algorithmic complexity grows too much to be // worth the extra gains we get. This gets us pretty close. // The final order leaves us with objects with highest priority going // at the end of our list. struct X86FrameSortingComparator { inline bool operator()(const X86FrameSortingObject &A, const X86FrameSortingObject &B) const { uint64_t DensityAScaled, DensityBScaled; // For consistency in our comparison, all invalid objects are placed // at the end. This also allows us to stop walking when we hit the // first invalid item after it's all sorted. if (!A.IsValid) return false; if (!B.IsValid) return true; // The density is calculated by doing : // (double)DensityA = A.ObjectNumUses / A.ObjectSize // (double)DensityB = B.ObjectNumUses / B.ObjectSize // Since this approach may cause inconsistencies in // the floating point <, >, == comparisons, depending on the floating // point model with which the compiler was built, we're going // to scale both sides by multiplying with // A.ObjectSize * B.ObjectSize. This ends up factoring away // the division and, with it, the need for any floating point // arithmetic. DensityAScaled = static_cast(A.ObjectNumUses) * static_cast(B.ObjectSize); DensityBScaled = static_cast(B.ObjectNumUses) * static_cast(A.ObjectSize); // If the two densities are equal, prioritize highest alignment // objects. This allows for similar alignment objects // to be packed together (given the same density). // There's room for improvement here, also, since we can pack // similar alignment (different density) objects next to each // other to save padding. This will also require further // complexity/iterations, and the overall gain isn't worth it, // in general. Something to keep in mind, though. if (DensityAScaled == DensityBScaled) return A.ObjectAlignment < B.ObjectAlignment; return DensityAScaled < DensityBScaled; } }; } // namespace // Order the symbols in the local stack. // We want to place the local stack objects in some sort of sensible order. // The heuristic we use is to try and pack them according to static number // of uses and size of object in order to minimize code size. void X86FrameLowering::orderFrameObjects( const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // Don't waste time if there's nothing to do. if (ObjectsToAllocate.empty()) return; // Create an array of all MFI objects. We won't need all of these // objects, but we're going to create a full array of them to make // it easier to index into when we're counting "uses" down below. // We want to be able to easily/cheaply access an object by simply // indexing into it, instead of having to search for it every time. std::vector SortingObjects(MFI.getObjectIndexEnd()); // Walk the objects we care about and mark them as such in our working // struct. for (auto &Obj : ObjectsToAllocate) { SortingObjects[Obj].IsValid = true; SortingObjects[Obj].ObjectIndex = Obj; SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj); // Set the size. int ObjectSize = MFI.getObjectSize(Obj); if (ObjectSize == 0) // Variable size. Just use 4. SortingObjects[Obj].ObjectSize = 4; else SortingObjects[Obj].ObjectSize = ObjectSize; } // Count the number of uses for each object. 
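// [Illustrative sketch, not part of the patch.] The comparator above sidesteps
// floating point by cross-multiplying: usesA/sizeA < usesB/sizeB is evaluated as
// usesA*sizeB < usesB*sizeA, which is exact and fits in 64 bits for the 32-bit
// counts kept in X86FrameSortingObject. Hypothetical standalone version:
#include <cstdint>

static bool lowerDensity(unsigned UsesA, unsigned SizeA, unsigned UsesB,
                         unsigned SizeB) {
  // Ties are broken by alignment in the real comparator.
  return uint64_t(UsesA) * SizeB < uint64_t(UsesB) * SizeA;
}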
for (auto &MBB : MF) { for (auto &MI : MBB) { if (MI.isDebugInstr()) continue; for (const MachineOperand &MO : MI.operands()) { // Check to see if it's a local stack symbol. if (!MO.isFI()) continue; int Index = MO.getIndex(); // Check to see if it falls within our range, and is tagged // to require ordering. if (Index >= 0 && Index < MFI.getObjectIndexEnd() && SortingObjects[Index].IsValid) SortingObjects[Index].ObjectNumUses++; } } } // Sort the objects using X86FrameSortingAlgorithm (see its comment for // info). llvm::stable_sort(SortingObjects, X86FrameSortingComparator()); // Now modify the original list to represent the final order that // we want. The order will depend on whether we're going to access them // from the stack pointer or the frame pointer. For SP, the list should // end up with the END containing objects that we want with smaller offsets. // For FP, it should be flipped. int i = 0; for (auto &Obj : SortingObjects) { // All invalid items are sorted at the end, so it's safe to stop. if (!Obj.IsValid) break; ObjectsToAllocate[i++] = Obj.ObjectIndex; } // Flip it if we're accessing off of the FP. if (!TRI->needsStackRealignment(MF) && hasFP(MF)) std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end()); } unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. unsigned Offset = 16; // RBP is immediately pushed. Offset += SlotSize; // All callee-saved registers are then pushed. Offset += MF.getInfo()->getCalleeSavedFrameSize(); // Every funclet allocates enough stack space for the largest outgoing call. Offset += getWinEHFuncletFrameSize(MF); return Offset; } void X86FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { // Mark the function as not having WinCFI. We will set it back to true in // emitPrologue if it gets called and emits CFI. MF.setHasWinCFI(false); // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. const Function &F = MF.getFunction(); if (!STI.is64Bit() || !MF.hasEHFunclets() || classifyEHPersonality(F.getPersonalityFn()) != EHPersonality::MSVC_CXX) return; // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset // relative to RSP after the prologue. Find the offset of the last fixed // object, so that we can allocate a slot immediately following it. If there // were no fixed objects, use offset -SlotSize, which is immediately after the // return address. Fixed objects have negative frame indices. MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); int64_t MinFixedObjOffset = -SlotSize; for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I)); for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { for (WinEHHandlerType &H : TBME.HandlerArray) { int FrameIndex = H.CatchObj.FrameIndex; if (FrameIndex != INT_MAX) { // Ensure alignment. unsigned Align = MFI.getObjectAlign(FrameIndex).value(); MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align; MinFixedObjOffset -= MFI.getObjectSize(FrameIndex); MFI.setObjectOffset(FrameIndex, MinFixedObjOffset); } } } // Ensure alignment. 
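// [Illustrative sketch, not part of the patch.] Catch objects and UnwindHelp are
// placed at negative fixed offsets, so "ensure alignment" here rounds the running
// offset away from zero to a multiple of the required alignment before carving out
// the object below it. Hypothetical helper showing the arithmetic:
#include <cstdint>

static int64_t placeFixedObject(int64_t MinFixedObjOffset, int64_t Size,
                                int64_t Alignment) {
  MinFixedObjOffset -= (-MinFixedObjOffset) % Alignment; // offsets are negative here
  MinFixedObjOffset -= Size;                             // allocate the object
  return MinFixedObjOffset;                              // its new frame offset
}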
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8; int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; int UnwindHelpFI = MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; // Store -2 into UnwindHelp on function entry. We have to scan forwards past // other frame setup instructions. MachineBasicBlock &MBB = MF.front(); auto MBBI = MBB.begin(); while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) ++MBBI; DebugLoc DL = MBB.findDebugLoc(MBBI); addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), UnwindHelpFI) .addImm(-2); } void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced( MachineFunction &MF, RegScavenger *RS) const { if (STI.is32Bit() && MF.hasEHFunclets()) restoreWinEHStackPointersInParent(MF); } void X86FrameLowering::restoreWinEHStackPointersInParent( MachineFunction &MF) const { // 32-bit functions have to restore stack pointers when control is transferred // back to the parent function. These blocks are identified as eh pads that // are not funclet entries. bool IsSEH = isAsynchronousEHPersonality( classifyEHPersonality(MF.getFunction().getPersonalityFn())); for (MachineBasicBlock &MBB : MF) { bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry(); if (NeedsRestore) restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(), /*RestoreSP=*/IsSEH); } } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 302a15701d81..a96f73df855d 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1,6012 +1,6019 @@ //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines a DAG pattern matching instruction selector for X86, // converting from a legalized dag to a X86 dag. 
// //===----------------------------------------------------------------------===// #include "X86.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); static cl::opt AndImmShrink("x86-and-imm-shrink", cl::init(true), cl::desc("Enable setting constant bits to reduce size of mask immediates"), cl::Hidden); static cl::opt EnablePromoteAnyextLoad( "x86-promote-anyext-load", cl::init(true), cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); extern cl::opt IndirectBranchTracking; //===----------------------------------------------------------------------===// // Pattern Matcher Implementation //===----------------------------------------------------------------------===// namespace { /// This corresponds to X86AddressMode, but uses SDValue's instead of register /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, FrameIndexBase } BaseType; // This is really a union, discriminated by BaseType! SDValue Base_Reg; int Base_FrameIndex; unsigned Scale; SDValue IndexReg; int32_t Disp; SDValue Segment; const GlobalValue *GV; const Constant *CP; const BlockAddress *BlockAddr; const char *ES; MCSymbol *MCSym; int JT; Align Alignment; // CP alignment. unsigned char SymbolFlags; // X86II::MO_* bool NegateIndex = false; X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {} bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || MCSym != nullptr || JT != -1 || BlockAddr != nullptr; } bool hasBaseOrIndexReg() const { return BaseType == FrameIndexBase || IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } /// Return true if this addressing mode is already RIP-relative. 
bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = dyn_cast_or_null(Base_Reg.getNode())) return RegNode->getReg() == X86::RIP; return false; } void setBaseReg(SDValue Reg) { BaseType = RegBase; Base_Reg = Reg; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(SelectionDAG *DAG = nullptr) { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; if (Base_Reg.getNode()) Base_Reg.getNode()->dump(DAG); else dbgs() << "nul\n"; if (BaseType == FrameIndexBase) dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; dbgs() << " Scale " << Scale << '\n' << "IndexReg "; if (NegateIndex) dbgs() << "negate "; if (IndexReg.getNode()) IndexReg.getNode()->dump(DAG); else dbgs() << "nul\n"; dbgs() << " Disp " << Disp << '\n' << "GV "; if (GV) GV->dump(); else dbgs() << "nul"; dbgs() << " CP "; if (CP) CP->dump(); else dbgs() << "nul"; dbgs() << '\n' << "ES "; if (ES) dbgs() << ES; else dbgs() << "nul"; dbgs() << " MCSym "; if (MCSym) dbgs() << MCSym; else dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; } #endif }; } namespace { //===--------------------------------------------------------------------===// /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; /// If true, selector should try to optimize for minimum code size. bool OptForMinSize; /// Disable direct TLS access through segment registers. bool IndirectTlsSegRefs; public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForMinSize(false), IndirectTlsSegRefs(false) {} StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; } bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. Subtarget = &MF.getSubtarget(); IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( "indirect-tls-seg-refs"); // OptFor[Min]Size are used in pattern predicates that isel is matching. OptForMinSize = MF.getFunction().hasMinSize(); assert((!OptForMinSize || MF.getFunction().hasOptSize()) && "OptForMinSize implies OptForSize"); SelectionDAGISel::runOnMachineFunction(MF); return true; } void emitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; void PreprocessISelDAG() override; void PostprocessISelDAG() override; // Include the pieces autogenerated from the target description. 
#include "X86GenDAGISel.inc" private: void Select(SDNode *N) override; bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, bool AllowSegmentRegForX32 = false); bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, SDValue ScaleOp, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectRelocImm(SDValue N, SDValue &Op); bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); // Convenience method where P is also root. bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool isProfitableToFormMaskedOp(SDNode *N) const; /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, MVT VT, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) Base = CurDAG->getTargetFrameIndex( AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); else if (AM.Base_Reg.getNode()) Base = AM.Base_Reg; else Base = CurDAG->getRegister(0, VT); Scale = getI8Imm(AM.Scale, DL); // Negate the index if needed. if (AM.NegateIndex) { unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, AM.IndexReg), 0); AM.IndexReg = Neg; } if (AM.IndexReg.getNode()) Index = AM.IndexReg; else Index = CurDAG->getRegister(0, VT); // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. 
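// [Illustrative sketch, not part of the patch.] The Base/Scale/Index/Disp/Segment
// operands built by getAddressOperands describe the usual x86 effective address,
// conceptually SegBase + Base + Scale*Index + Disp, with the displacement limited to
// a signed 32 bits even in 64-bit mode (hence the MVT::i32 displacement nodes below).
#include <cstdint>

static uint64_t effectiveAddress(uint64_t SegBase, uint64_t Base, uint64_t Index,
                                 unsigned Scale, int32_t Disp) {
  return SegBase + Base + uint64_t(Scale) * Index + uint64_t(int64_t(Disp));
}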
if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, AM.Disp, AM.SymbolFlags); else if (AM.ES) { assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); } else if (AM.MCSym) { assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); assert(AM.SymbolFlags == 0 && "oo"); Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); } else if (AM.JT != -1) { assert(!AM.Disp && "Non-zero displacement is ignored with JT."); Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); } else if (AM.BlockAddr) Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, AM.SymbolFlags); else Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); if (AM.Segment.getNode()) Segment = AM.Segment; else Segment = CurDAG->getRegister(0, MVT::i16); } // Utility function to determine whether we should avoid selecting // immediate forms of instructions for better code size or not. // At a high level, we'd like to avoid such instructions when // we have similar constants used within the same basic block // that can be kept in a register. // bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { uint32_t UseCount = 0; // Do not want to hoist if we're not optimizing for size. // TODO: We'd like to remove this restriction. // See the comment in X86InstrInfo.td for more info. if (!CurDAG->shouldOptForSize()) return false; // Walk all the users of the immediate. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { SDNode *User = *UI; // This user is already selected. Count it as a legitimate use and // move on. if (User->isMachineOpcode()) { UseCount++; continue; } // We want to count stores of immediates as real uses. if (User->getOpcode() == ISD::STORE && User->getOperand(1).getNode() == N) { UseCount++; continue; } // We don't currently match users that have > 2 operands (except // for stores, which are handled above) // Those instruction won't match in ISEL, for now, and would // be counted incorrectly. // This may change in the future as we add additional instruction // types. if (User->getNumOperands() != 2) continue; // If this is a sign-extended 8-bit integer immediate used in an ALU // instruction, there is probably an opcode encoding to save space. auto *C = dyn_cast(N); if (C && isInt<8>(C->getSExtValue())) continue; // Immediates that are used for offsets as part of stack // manipulation should be left alone. These are typically // used to indicate SP offsets for argument passing and // will get pulled into stores/pushes (implicitly). if (User->getOpcode() == X86ISD::ADD || User->getOpcode() == ISD::ADD || User->getOpcode() == X86ISD::SUB || User->getOpcode() == ISD::SUB) { // Find the other operand of the add/sub. SDValue OtherOp = User->getOperand(0); if (OtherOp.getNode() == N) OtherOp = User->getOperand(1); // Don't count if the other operand is SP. RegisterSDNode *RegNode; if (OtherOp->getOpcode() == ISD::CopyFromReg && (RegNode = dyn_cast_or_null( OtherOp->getOperand(1).getNode()))) if ((RegNode->getReg() == X86::ESP) || (RegNode->getReg() == X86::RSP)) continue; } // ... otherwise, count this and move on. UseCount++; } // If we have more than 1 use, then recommend for hoisting. return (UseCount > 1); } /// Return a target constant with the specified value of type i8. 
inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i8); } /// Return a target constant with the specified value, of type i32. inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } /// Return a target constant with the specified value, of type i64. inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i64); } SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); uint64_t Index = N->getConstantOperandVal(1); MVT VecVT = N->getOperand(0).getSimpleValueType(); return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); uint64_t Index = N->getConstantOperandVal(2); MVT VecVT = N->getSimpleValueType(0); return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } // Helper to detect unneeded and instructions on shift amounts. Called // from PatFrags in tablegen. bool isUnneededShiftMask(SDNode *N, unsigned Width) const { assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); const APInt &Val = cast(N->getOperand(1))->getAPIntValue(); if (Val.countTrailingOnes() >= Width) return true; APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; return Mask.countTrailingOnes() >= Width; } /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. SDNode *getGlobalBaseReg(); /// Return a reference to the TargetMachine, casted to the target-specific /// type. const X86TargetMachine &getTargetMachine() const { return static_cast(TM); } /// Return a reference to the TargetInstrInfo, casted to the target-specific /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } /// Address-mode matching performs shift-of-and to and-of-shift /// reassociation in order to expose more scaled addressing /// opportunities. bool ComplexPatternFuncMutatesDAG() const override { return true; } bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; // Indicates we should prefer to use a non-temporal load for this load. 
bool useNonTemporalLoad(LoadSDNode *N) const { if (!N->isNonTemporal()) return false; unsigned StoreSize = N->getMemoryVT().getStoreSize(); if (N->getAlignment() < StoreSize) return false; switch (StoreSize) { default: llvm_unreachable("Unsupported store size"); case 4: case 8: return false; case 16: return Subtarget->hasSSE41(); case 32: return Subtarget->hasAVX2(); case 64: return Subtarget->hasAVX512(); } } bool foldLoadStoreIntoMemOperand(SDNode *Node); MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); bool matchBitExtract(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTERNLOG(SDNode *N); bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC, SDValue A, SDValue B, SDValue C, uint8_t Imm); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); bool tryMatchBitSelect(SDNode *N); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node, SDValue &InFlag); bool tryOptimizeRem8Extend(SDNode *N); bool onlyUsesZeroFlag(SDValue Flags) const; bool hasNoSignFlagUses(SDValue Flags) const; bool hasNoCarryFlagUses(SDValue Flags) const; }; } // Returns true if this masked compare can be implemented legally with this // type. static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. EVT OpVT = N->getOperand(0).getValueType(); // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the // second operand. if (Opcode == X86ISD::STRICT_CMPM) OpVT = N->getOperand(1).getValueType(); if (OpVT.is256BitVector() || OpVT.is128BitVector()) return Subtarget->hasVLX(); return true; } // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || Opcode == X86ISD::FSETCCM_SAE) return true; return false; } // Returns true if we can assume the writer of the mask has zero extended it // for us. bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { // If this is an AND, check if we have a compare on either side. As long as // one side guarantees the mask is zero extended, the AND will preserve those // zeros. if (N->getOpcode() == ISD::AND) return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); return isLegalMaskCompare(N, Subtarget); } bool X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (OptLevel == CodeGenOpt::None) return false; if (!N.hasOneUse()) return false; if (N.getOpcode() != ISD::LOAD) return true; // Don't fold non-temporal loads if we have an instruction for them. if (useNonTemporalLoad(cast(N))) return false; // If N is a load, do additional profitability checks. 
if (U == Root) { switch (U->getOpcode()) { default: break; case X86ISD::ADD: case X86ISD::ADC: case X86ISD::SUB: case X86ISD::SBB: case X86ISD::AND: case X86ISD::XOR: case X86ISD::OR: case ISD::ADD: case ISD::ADDCARRY: case ISD::AND: case ISD::OR: case ISD::XOR: { SDValue Op1 = U->getOperand(1); // If the other operand is a 8-bit immediate we should fold the immediate // instead. This reduces code size. // e.g. // movl 4(%esp), %eax // addl $4, %eax // vs. // movl $4, %eax // addl 4(%esp), %eax // The former is 2 bytes shorter. In case where the increment is 1, then // the saving can be 4 bytes (by using incl %eax). if (ConstantSDNode *Imm = dyn_cast(Op1)) { if (Imm->getAPIntValue().isSignedIntN(8)) return false; // If this is a 64-bit AND with an immediate that fits in 32-bits, // prefer using the smaller and over folding the load. This is needed to // make sure immediates created by shrinkAndImmediate are always folded. // Ideally we would narrow the load during DAG combine and get the // best of both worlds. if (U->getOpcode() == ISD::AND && Imm->getAPIntValue().getBitWidth() == 64 && Imm->getAPIntValue().isSignedIntN(32)) return false; // If this really a zext_inreg that can be represented with a movzx // instruction, prefer that. // TODO: We could shrink the load and fold if it is non-volatile. if (U->getOpcode() == ISD::AND && (Imm->getAPIntValue() == UINT8_MAX || Imm->getAPIntValue() == UINT16_MAX || Imm->getAPIntValue() == UINT32_MAX)) return false; // ADD/SUB with can negate the immediate and use the opposite operation // to fit 128 into a sign extended 8 bit immediate. if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && (-Imm->getAPIntValue()).isSignedIntN(8)) return false; if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && (-Imm->getAPIntValue()).isSignedIntN(8) && hasNoCarryFlagUses(SDValue(U, 1))) return false; } // If the other operand is a TLS address, we should fold it instead. // This produces // movl %gs:0, %eax // leal i@NTPOFF(%eax), %eax // instead of // movl $i@NTPOFF, %eax // addl %gs:0, %eax // if the block also has an access to a second TLS address this will save // a load. // FIXME: This is probably also true for non-TLS addresses. if (Op1.getOpcode() == X86ISD::Wrapper) { SDValue Val = Op1.getOperand(0); if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) return false; } // Don't fold load if this matches the BTS/BTR/BTC patterns. // BTS: (or X, (shl 1, n)) // BTR: (and X, (rotl -2, n)) // BTC: (xor X, (shl 1, n)) if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { if (U->getOperand(0).getOpcode() == ISD::SHL && isOneConstant(U->getOperand(0).getOperand(0))) return false; if (U->getOperand(1).getOpcode() == ISD::SHL && isOneConstant(U->getOperand(1).getOperand(0))) return false; } if (U->getOpcode() == ISD::AND) { SDValue U0 = U->getOperand(0); SDValue U1 = U->getOperand(1); if (U0.getOpcode() == ISD::ROTL) { auto *C = dyn_cast(U0.getOperand(0)); if (C && C->getSExtValue() == -2) return false; } if (U1.getOpcode() == ISD::ROTL) { auto *C = dyn_cast(U1.getOperand(0)); if (C && C->getSExtValue() == -2) return false; } } break; } case ISD::SHL: case ISD::SRA: case ISD::SRL: // Don't fold a load into a shift by immediate. The BMI2 instructions // support folding a load, but not an immediate. The legacy instructions // support folding an immediate, but can't fold a load. Folding an // immediate is preferable to folding a load. 
if (isa(U->getOperand(1))) return false; break; } } // Prevent folding a load if this can implemented with an insert_subreg or // a move that implicitly zeroes. if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && isNullConstant(Root->getOperand(2)) && (Root->getOperand(0).isUndef() || ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) return false; return true; } // Indicates it is profitable to form an AVX512 masked operation. Returning // false will favor a masked register-register masked move or vblendm and the // operation will be selected separately. bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { assert( (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && "Unexpected opcode!"); // If the operation has additional users, the operation will be duplicated. // Check the use count to prevent that. // FIXME: Are there cheap opcodes we might want to duplicate? return N->getOperand(1).hasOneUse(); } /// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector Ops; SDValue Chain = OrigChain.getOperand(0); if (Chain.getNode() == Load.getNode()) Ops.push_back(Load.getOperand(0)); else { assert(Chain.getOpcode() == ISD::TokenFactor && "Unexpected chain operand"); for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) if (Chain.getOperand(i).getNode() == Load.getNode()) Ops.push_back(Load.getOperand(0)); else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); Ops.clear(); Ops.push_back(NewChain); } Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); Ops.append(Call->op_begin() + 1, Call->op_end()); CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } /// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call /// chain and the load. static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { // The transformation is somewhat dangerous if the call's chain was glued to // the call. After MoveBelowOrigChain the load is moved between the call and // the chain, this can create a cycle if the load is not folded. So it is // *really* important that we are sure the load will be folded. if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) return false; LoadSDNode *LD = dyn_cast(Callee.getNode()); if (!LD || !LD->isSimple() || LD->getAddressingMode() != ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD) return false; // Now let's find the callseq_start. while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { if (!Chain.hasOneUse()) return false; Chain = Chain.getOperand(0); } if (!Chain.getNumOperands()) return false; // Since we are not checking for AA here, conservatively abort if the chain // writes to memory. It's not safe to move the callee (a load) across a store. 
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

static bool isEndbrImm64(uint64_t Imm) {
  // There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
  // e.g. 0xF3660F1EFA, 0xF3670F1EFA
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e,
                                    0x64, 0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // 24bit 0x0F1EFA has matched
  while (i < 64) {
    uint8_t Byte = (Imm >> i) & 0xFF;
    if (Byte == 0xF3)
      return true;
    if (!llvm::is_contained(OptionalPrefixBytes, Byte))
      return false;
    i += 8;
  }

  return false;
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // and we do not want attackers to find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so that the pattern does not show up in
    // the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(0);
      int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
          MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
          Complement = CurDAG->getNOT(dl, Complement, VT);
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }

    /// Convert vector increment or decrement to sub/add with an all-ones
    /// constant:
    /// add X, <1, 1...> --> sub X, <-1, -1...>
    /// sub X, <1, 1...> --> add X, <-1, -1...>
    /// The all-ones vector constant can be materialized using a pcmpeq
    /// instruction that is commonly recognized as an idiom (has no register
    /// dependency), so that's better/smaller than loading a splat 1 constant.
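    /// For example (v4i32, illustrative only):
    ///   pcmpeqd %xmm1, %xmm1   // xmm1 = <-1, -1, -1, -1>
    ///   psubd   %xmm1, %xmm0   // X - (-1) == X + 1
    /// instead of loading a <1, 1, 1, 1> splat from the constant pool.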
if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && N->getSimpleValueType(0).isVector()) { APInt SplatVal; if (X86::isConstantSplat(N->getOperand(1), SplatVal) && SplatVal.isOneValue()) { SDLoc DL(N); MVT VT = N->getSimpleValueType(0); unsigned NumElts = VT.getSizeInBits() / 32; SDValue AllOnes = CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts)); AllOnes = CurDAG->getBitcast(VT, AllOnes); unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; SDValue Res = CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } } switch (N->getOpcode()) { case X86ISD::VBROADCAST: { MVT VT = N->getSimpleValueType(0); // Emulate v32i16/v64i8 broadcast without BWI. if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; SDLoc dl(N); SDValue NarrowBCast = CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); SDValue Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); unsigned Index = VT == MVT::v32i16 ? 16 : 32; Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, CurDAG->getIntPtrConstant(Index, dl)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } break; } case X86ISD::VBROADCAST_LOAD: { MVT VT = N->getSimpleValueType(0); // Emulate v32i16/v64i8 broadcast without BWI. if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; auto *MemNode = cast(N); SDLoc dl(N); SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(), MemNode->getMemOperand()); SDValue Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); unsigned Index = VT == MVT::v32i16 ? 16 : 32; Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, CurDAG->getIntPtrConstant(Index, dl)); --I; SDValue To[] = {Res, NarrowBCast.getValue(1)}; CurDAG->ReplaceAllUsesWith(N, To); ++I; MadeChange = true; continue; } break; } case ISD::VSELECT: { // Replace VSELECT with non-mask conditions with with BLENDV. if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1) break; assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); SDValue Blendv = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1), N->getOperand(2)); --I; CurDAG->ReplaceAllUsesWith(N, Blendv.getNode()); ++I; MadeChange = true; continue; } case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: { // Replace vector fp_to_s/uint with their X86 specific equivalent so we // don't need 2 sets of patterns. 
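      // e.g. a vector ISD::FP_TO_SINT becomes X86ISD::CVTTP2SI, and the
      // STRICT_ variants keep their chain operand (see the mapping below).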
if (!N->getSimpleValueType(0).isVector()) break; unsigned NewOpc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; } SDValue Res; if (N->isStrictFPOpcode()) Res = CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, {N->getOperand(0), N->getOperand(1)}); else Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } case ISD::SHL: case ISD::SRA: case ISD::SRL: { // Replace vector shifts with their X86 specific equivalent so we don't // need 2 sets of patterns. if (!N->getValueType(0).isVector()) break; unsigned NewOpc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::SHL: NewOpc = X86ISD::VSHLV; break; case ISD::SRA: NewOpc = X86ISD::VSRAV; break; case ISD::SRL: NewOpc = X86ISD::VSRLV; break; } SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } case ISD::ANY_EXTEND: case ISD::ANY_EXTEND_VECTOR_INREG: { // Replace vector any extend with the zero extend equivalents so we don't // need 2 sets of patterns. Ignore vXi1 extensions. if (!N->getValueType(0).isVector()) break; unsigned NewOpc; if (N->getOperand(0).getScalarValueSizeInBits() == 1) { assert(N->getOpcode() == ISD::ANY_EXTEND && "Unexpected opcode for mask vector!"); NewOpc = ISD::SIGN_EXTEND; } else { NewOpc = N->getOpcode() == ISD::ANY_EXTEND ? ISD::ZERO_EXTEND : ISD::ZERO_EXTEND_VECTOR_INREG; } SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } case ISD::FCEIL: case ISD::STRICT_FCEIL: case ISD::FFLOOR: case ISD::STRICT_FFLOOR: case ISD::FTRUNC: case ISD::STRICT_FTRUNC: case ISD::FROUNDEVEN: case ISD::STRICT_FROUNDEVEN: case ISD::FNEARBYINT: case ISD::STRICT_FNEARBYINT: case ISD::FRINT: case ISD::STRICT_FRINT: { // Replace fp rounding with their X86 specific equivalent so we don't // need 2 sets of patterns. 
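      // The immediate built below is the VRNDSCALE rounding-control operand,
      // e.g. FCEIL maps to 0xA and FFLOOR to 0x9 in the switch that follows.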
unsigned Imm; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::STRICT_FCEIL: case ISD::FCEIL: Imm = 0xA; break; case ISD::STRICT_FFLOOR: case ISD::FFLOOR: Imm = 0x9; break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: Imm = 0xB; break; case ISD::STRICT_FROUNDEVEN: case ISD::FROUNDEVEN: Imm = 0x8; break; case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: Imm = 0xC; break; case ISD::STRICT_FRINT: case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(N); bool IsStrict = N->isStrictFPOpcode(); SDValue Res; if (IsStrict) Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, {N->getValueType(0), MVT::Other}, {N->getOperand(0), N->getOperand(1), CurDAG->getTargetConstant(Imm, dl, MVT::i32)}); else Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), CurDAG->getTargetConstant(Imm, dl, MVT::i32)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } case X86ISD::FANDN: case X86ISD::FAND: case X86ISD::FOR: case X86ISD::FXOR: { // Widen scalar fp logic ops to vector to reduce isel patterns. // FIXME: Can we do this during lowering/combine. MVT VT = N->getSimpleValueType(0); if (VT.isVector() || VT == MVT::f128) break; MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32; SDLoc dl(N); SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(0)); SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(1)); SDValue Res; if (Subtarget->hasSSE2()) { EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); unsigned Opc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; case X86ISD::FAND: Opc = ISD::AND; break; case X86ISD::FOR: Opc = ISD::OR; break; case X86ISD::FXOR: Opc = ISD::XOR; break; } Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); } else { Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); } Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, CurDAG->getIntPtrConstant(0, dl)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } } if (OptLevel != CodeGenOpt::None && // Only do this when the target can fold the load into the call or // jmp. !Subtarget->useIndirectThunkCalls() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just /// before the call to allow it to be folded. /// /// [Load chain] /// ^ /// | /// [Load] /// ^ ^ /// | | /// / \-- /// / | ///[CALLSEQ_START] | /// ^ | /// | | /// [LOAD/C2Reg] | /// | | /// \ / /// \ / /// [CALL] bool HasCallSeq = N->getOpcode() == X86ISD::CALL; SDValue Chain = N->getOperand(0); SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; MadeChange = true; continue; } // Lower fpround and fpextend nodes that target the FP stack to be store and // load to the stack. This is a gross hack. We would like to simply mark // these as being illegal, but when we do that, legalize produces these when // it expands calls, then expands these in the same legalize pass. 
We would // like dag combine to be able to hack on these between the call expansion // and the node legalization. As such this pass basically does "really // late" legalization of these inline with the X86 isel pass. // FIXME: This should only happen when not compiled with -O0. switch (N->getOpcode()) { default: continue; case ISD::FP_ROUND: case ISD::FP_EXTEND: { MVT SrcVT = N->getOperand(0).getSimpleValueType(); MVT DstVT = N->getSimpleValueType(0); // If any of the sources are vectors, no fp stack involved. if (SrcVT.isVector() || DstVT.isVector()) continue; // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. const X86TargetLowering *X86Lowering = static_cast(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; if (!SrcIsSSE && !DstIsSSE) { // If this is an FPStack extension, it is a noop. if (N->getOpcode() == ISD::FP_EXTEND) continue; // If this is a value-preserving FPStack truncation, it is a noop. if (N->getConstantOperandVal(1)) continue; } // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); int SPFI = cast(MemTmp)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? SDValue Store = CurDAG->getTruncStore( CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT); SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MPI, MemVT); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because // anything below the conversion could be folded into other existing nodes. // To avoid invalidating 'I', back it up to the convert node. --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); break; } //The sequence of events for lowering STRICT_FP versions of these nodes requires //dealing with the chain differently, as there is already a preexisting chain. case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: { MVT SrcVT = N->getOperand(1).getSimpleValueType(); MVT DstVT = N->getSimpleValueType(0); // If any of the sources are vectors, no fp stack involved. if (SrcVT.isVector() || DstVT.isVector()) continue; // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. const X86TargetLowering *X86Lowering = static_cast(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; if (!SrcIsSSE && !DstIsSSE) { // If this is an FPStack extension, it is a noop. if (N->getOpcode() == ISD::STRICT_FP_EXTEND) continue; // If this is a value-preserving FPStack truncation, it is a noop. if (N->getConstantOperandVal(2)) continue; } // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? 
DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); int SPFI = cast(MemTmp)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? //Since the operation is StrictFP, use the preexisting chain. SDValue Store, Result; if (!SrcIsSSE) { SDVTList VTs = CurDAG->getVTList(MVT::Other); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, MPI, /*Align*/ None, MachineMemOperand::MOStore); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Store->getFlags(); Flags.setNoFPExcept(true); Store->setFlags(Flags); } } else { assert(SrcVT == MemVT && "Unexpected VT!"); Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, MPI); } if (!DstIsSSE) { SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); SDValue Ops[] = {Store, MemTmp}; Result = CurDAG->getMemIntrinsicNode( X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, /*Align*/ None, MachineMemOperand::MOLoad); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Result->getFlags(); Flags.setNoFPExcept(true); Result->setFlags(Flags); } } else { assert(DstVT == MemVT && "Unexpected VT!"); Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI); } // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because // anything below the conversion could be folded into other existing nodes. // To avoid invalidating 'I', back it up to the convert node. --I; CurDAG->ReplaceAllUsesWith(N, Result.getNode()); break; } } // Now that we did that, the node is dead. Increment the iterator to the // next node to process, then delete N. ++I; MadeChange = true; } // Remove any dead nodes that may have been left behind. if (MadeChange) CurDAG->RemoveDeadNodes(); } // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { unsigned Opc = N->getMachineOpcode(); if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && Opc != X86::MOVSX64rr8) return false; SDValue N0 = N->getOperand(0); // We need to be extracting the lower bit of an extend. if (!N0.isMachineOpcode() || N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || N0.getConstantOperandVal(1) != X86::sub_8bit) return false; // We're looking for either a movsx or movzx to match the original opcode. unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX : X86::MOVSX32rr8_NOREX; SDValue N00 = N0.getOperand(0); if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) return false; if (Opc == X86::MOVSX64rr8) { // If we had a sign extend from 8 to 64 bits. We still need to go from 32 // to 64. MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), MVT::i64, N00); ReplaceUses(N, Extend); } else { // Ok we can drop this extend and just use the original extend. ReplaceUses(N, N00.getNode()); } return true; } void X86DAGToDAGISel::PostprocessISelDAG() { // Skip peepholes at -O0. if (TM.getOptLevel() == CodeGenOpt::None) return; SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. 
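    // Beyond that check, the peepholes below remove redundant movzx/movsx
    // after an 8-bit divrem, turn TESTrr of an AND result into a plain TEST
    // (or TESTmr when the AND had a folded load), rewrite KAND+KORTEST into
    // KTEST when only the zero flag is used, and drop register moves that
    // were only inserted to zero the upper vector bits.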
if (N->use_empty() || !N->isMachineOpcode()) continue; if (tryOptimizeRem8Extend(N)) { MadeChange = true; continue; } // Look for a TESTrr+ANDrr pattern where both operands of the test are // the same. Rewrite to remove the AND. unsigned Opc = N->getMachineOpcode(); if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || Opc == X86::TEST32rr || Opc == X86::TEST64rr) && N->getOperand(0) == N->getOperand(1) && N->isOnlyUserOf(N->getOperand(0).getNode()) && N->getOperand(0).isMachineOpcode()) { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) { MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1)); ReplaceUses(N, Test); MadeChange = true; continue; } if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) { unsigned NewOpc; switch (N0Opc) { case X86::AND8rm: NewOpc = X86::TEST8mr; break; case X86::AND16rm: NewOpc = X86::TEST16mr; break; case X86::AND32rm: NewOpc = X86::TEST32mr; break; case X86::AND64rm: NewOpc = X86::TEST64mr; break; } // Need to swap the memory and register operand. SDValue Ops[] = { And.getOperand(1), And.getOperand(2), And.getOperand(3), And.getOperand(4), And.getOperand(5), And.getOperand(0), And.getOperand(6) /* Chain */ }; MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops); CurDAG->setNodeMemRefs( Test, cast(And.getNode())->memoperands()); ReplaceUses(N, Test); MadeChange = true; continue; } } // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is // used. We're doing this late so we can prefer to fold the AND into masked // comparisons. Doing that can be better for the live range of the mask // register. if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && N->getOperand(0) == N->getOperand(1) && N->isOnlyUserOf(N->getOperand(0).getNode()) && N->getOperand(0).isMachineOpcode() && onlyUsesZeroFlag(SDValue(N, 0))) { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other // KAND instructions and KTEST use the same ISA feature. if (N0Opc == X86::KANDBrr || (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { unsigned NewOpc; switch (Opc) { default: llvm_unreachable("Unexpected opcode!"); case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; } MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1)); ReplaceUses(N, KTest); MadeChange = true; continue; } } // Attempt to remove vectors moves that were inserted to zero upper bits. if (Opc != TargetOpcode::SUBREG_TO_REG) continue; unsigned SubRegIdx = N->getConstantOperandVal(2); if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) continue; SDValue Move = N->getOperand(1); if (!Move.isMachineOpcode()) continue; // Make sure its one of the move opcodes we recognize. 
switch (Move.getMachineOpcode()) { default: continue; case X86::VMOVAPDrr: case X86::VMOVUPDrr: case X86::VMOVAPSrr: case X86::VMOVUPSrr: case X86::VMOVDQArr: case X86::VMOVDQUrr: case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: break; } SDValue In = Move.getOperand(0); if (!In.isMachineOpcode() || In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) continue; // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers // the SHA instructions which use a legacy encoding. uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; if ((TSFlags & X86II::EncodingMask) != X86II::VEX && (TSFlags & X86II::EncodingMask) != X86II::EVEX && (TSFlags & X86II::EncodingMask) != X86II::XOP) continue; // Producing instruction is another vector instruction. We can drop the // move. CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); MadeChange = true; } if (MadeChange) CurDAG->RemoveDeadNodes(); } /// Emit any code that needs to be executed only in the main function. void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); TargetLowering::CallLoweringInfo CLI(*CurDAG); CLI.setChain(CurDAG->getRoot()) .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), std::move(Args)); const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); std::pair Result = TLI.LowerCallTo(CLI); CurDAG->setRoot(Result.second); } } void X86DAGToDAGISel::emitFunctionEntryCode() { // If this is main, emit special code for main. const Function &F = MF->getFunction(); if (F.hasExternalLinkage() && F.getName() == "main") emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { // On 64-bit platforms, we can run into an issue where a frame index // includes a displacement that, when added to the explicit displacement, // will overflow the displacement field. Assuming that the frame index // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. return isInt<31>(Val); } bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // We may have already matched a displacement and the caller just added the // symbolic displacement. So we still need to do the checks even if Offset // is zero. int64_t Val = AM.Disp + Offset; // Cannot combine ExternalSymbol displacements with integer offsets. if (Val != 0 && (AM.ES || AM.MCSym)) return true; CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit()) { if (Val != 0 && !X86::isOffsetSuitableForCodeModel(Val, M, AM.hasSymbolicDisplacement())) return true; // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. 
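    // For example (illustrative): a frame-index base whose eventual offset
    // approaches 2^31, combined with a large explicit displacement, could
    // overflow the signed 32-bit displacement field once frame layout is
    // known, so such folds are rejected here.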
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
  }
  AM.Disp = Val;
  return false;
}

bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines
  // that gs:0 (or fs:0 on X86-64) contains its own address. However, for
  // X86-64 mode with 32-bit registers, as we get in ILP32 mode, those
  // registers are first zero-extended to 64 bits and then added to the base
  // address, which gives unwanted results when the register holds a negative
  // value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) {
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia())) {
      if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
        return true;
      switch (N->getPointerInfo().getAddrSpace()) {
      case X86AS::GS:
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        return false;
      case X86AS::FS:
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
        return false;
      // Address space X86AS::SS is not handled here, because it is not used to
      // address TLS areas.
      }
    }
  }

  return true;
}

/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an
/// addressing mode. These wrap things that will resolve down into a symbol
/// reference. If no match is possible, this returns true, otherwise it
/// returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
      ((M == CodeModel::Large && !IsRIPRelTLS) ||
       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
X86ISelAddressMode Backup = AM; int64_t Offset = 0; SDValue N0 = N.getOperand(0); if (GlobalAddressSDNode *G = dyn_cast(N0)) { AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); Offset = G->getOffset(); } else if (ConstantPoolSDNode *CP = dyn_cast(N0)) { AM.CP = CP->getConstVal(); AM.Alignment = CP->getAlign(); AM.SymbolFlags = CP->getTargetFlags(); Offset = CP->getOffset(); } else if (ExternalSymbolSDNode *S = dyn_cast(N0)) { AM.ES = S->getSymbol(); AM.SymbolFlags = S->getTargetFlags(); } else if (auto *S = dyn_cast(N0)) { AM.MCSym = S->getMCSymbol(); } else if (JumpTableSDNode *J = dyn_cast(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); } else if (BlockAddressSDNode *BA = dyn_cast(N0)) { AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); Offset = BA->getOffset(); } else llvm_unreachable("Unhandled symbol reference node."); if (foldOffsetIntoAddress(Offset, AM)) { AM = Backup; return true; } if (IsRIPRel) AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); // Commit the changes now that we know this fold is safe. return false; } /// Add the specified node to the specified addressing mode, returning true if /// it cannot be done. This just pattern matches for the addressing mode. bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Make a second attempt to fold a load, if we now know // that there will not be any other register. This is only performed for // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded // any foldable load the first time. if (Subtarget->isTarget64BitILP32() && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { SDValue Save_Base_Reg = AM.Base_Reg; if (auto *LoadN = dyn_cast(Save_Base_Reg)) { AM.Base_Reg = SDValue(); if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true)) AM.Base_Reg = Save_Base_Reg; } } // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has // a smaller encoding and avoids a scaled-index. if (AM.Scale == 2 && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr) { AM.Base_Reg = AM.IndexReg; AM.Scale = 1; } // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, // because it has a smaller encoding. // TODO: Which other code models can use this? switch (TM.getCodeModel()) { default: break; case CodeModel::Small: case CodeModel::Kernel: if (Subtarget->is64Bit() && AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); break; } return false; } bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth) { // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); X86ISelAddressMode Backup = AM; if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) return false; AM = Backup; // Try again after commutating the operands. 
if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth + 1) && !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1)) return false; AM = Backup; // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least // the add. if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode() && !AM.IndexReg.getNode()) { N = Handle.getValue(); AM.Base_Reg = N.getOperand(0); AM.IndexReg = N.getOperand(1); AM.Scale = 1; return false; } N = Handle.getValue(); return true; } // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N->getNodeId() == -1 || (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { DAG.RepositionNode(Pos->getIterator(), N.getNode()); // Mark Node as invalid for pruning as after this it may be a successor to a // selected node but otherwise be in the same position of Pos. // Conservatively mark it with the same -abs(Id) to assure node id // invariant is preserved. N->setNodeId(Pos->getNodeId()); SelectionDAGISel::InvalidateNodeId(N.getNode()); } } // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SRL || !isa(Shift.getOperand(1)) || !Shift.hasOneUse()) return true; int ScaleLog = 8 - Shift.getConstantOperandVal(1); if (ScaleLog <= 0 || ScaleLog >= 4 || Mask != (0xffu << ScaleLog)) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue Eight = DAG.getConstant(8, DL, MVT::i8); SDValue NewMask = DAG.getConstant(0xff, DL, VT); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, Eight); insertDAGNode(DAG, N, Srl); insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, And); insertDAGNode(DAG, N, ShlCount); insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); DAG.RemoveDeadNode(N.getNode()); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); return false; } // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, X86ISelAddressMode &AM) { SDValue Shift = N.getOperand(0); // Use a signed mask so that shifting right will insert sign bits. 
These // bits will be removed when we shift the result left so it doesn't matter // what we use. This might allow a smaller immediate encoding. int64_t Mask = cast(N->getOperand(1))->getSExtValue(); // If we have an any_extend feeding the AND, look through it to see if there // is a shift behind it. But only if the AND doesn't use the extended bits. // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? bool FoundAnyExtend = false; if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && Shift.getOperand(0).getSimpleValueType() == MVT::i32 && isUInt<32>(Mask)) { FoundAnyExtend = true; Shift = Shift.getOperand(0); } if (Shift.getOpcode() != ISD::SHL || !isa(Shift.getOperand(1))) return true; SDValue X = Shift.getOperand(0); // Not likely to be profitable if either the AND or SHIFT node has more // than one use (unless all uses are for address computation). Besides, // isel mechanism requires their node ids to be reused. if (!N.hasOneUse() || !Shift.hasOneUse()) return true; // Verify that the shift amount is something we can fold. unsigned ShiftAmt = Shift.getConstantOperandVal(1); if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); if (FoundAnyExtend) { SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); insertDAGNode(DAG, N, NewX); X = NewX; } SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << ShiftAmt; AM.IndexReg = NewAnd; return false; } // Implement some heroics to detect shifts of masked values where the mask can // be replaced by extending the shift and undoing that in the addressing mode // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in // the addressing mode. This results in code such as: // // int f(short *y, int *lookup_table) { // ... // return *y + lookup_table[*y >> 11]; // } // // Turning into: // movzwl (%rdi), %eax // movl %eax, %ecx // shrl $11, %ecx // addl (%rsi,%rcx,4), %eax // // Instead of: // movzwl (%rdi), %eax // movl %eax, %ecx // shrl $9, %ecx // andl $124, %rcx // addl (%rsi,%rcx), %eax // // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || !isa(Shift.getOperand(1))) return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); unsigned MaskLZ = countLeadingZeros(Mask); unsigned MaskTZ = countTrailingZeros(Mask); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. 
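  // e.g. Mask == 0x3FC (0b1111111100) has two trailing zeros, so the code
  // below tries AMShiftAmt == 2, i.e. an address scale of 4.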
unsigned AMShiftAmt = MaskTZ; // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; // Scale the leading zero count down based on the actual size of the value. // Also scale it down based on the size of the shift. unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; if (MaskLZ < ScaleDown) return true; MaskLZ -= ScaleDown; // The final check is to ensure that any masked out high bits of X are // already known to be zero. Otherwise, the mask has a semantic impact // other than masking out a couple of low bits. Unfortunately, because of // the mask, zero extensions will be removed from operands in some cases. // This code works extra hard to look through extensions because we can // replace them with zero extensions cheaply if necessary. bool ReplacingAnyExtend = false; if (X.getOpcode() == ISD::ANY_EXTEND) { unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - X.getOperand(0).getSimpleValueType().getSizeInBits(); // Assume that we'll replace the any-extend with a zero-extend, and // narrow the search to the extended value. X = X.getOperand(0); MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; ReplacingAnyExtend = true; } APInt MaskedHighBits = APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); KnownBits Known = DAG.computeKnownBits(X); if (MaskedHighBits != Known.Zero) return true; // We've identified a pattern that can be transformed into a single shift // and an addressing mode. Make it so. MVT VT = N.getSimpleValueType(); if (ReplacingAnyExtend) { assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewSRLAmt); insertDAGNode(DAG, N, NewSRL); insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewSRL; return false; } // Transform "(X >> SHIFT) & (MASK << C1)" to // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be // matched to a BEXTR later. Returns false if the simplification is performed. static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM, const X86Subtarget &Subtarget) { if (Shift.getOpcode() != ISD::SRL || !isa(Shift.getOperand(1)) || !Shift.hasOneUse() || !N.hasOneUse()) return true; // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 
if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) return true; // We need to ensure that mask is a continuous run of bits. if (!isShiftedMask_64(Mask)) return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. unsigned AMShiftAmt = countTrailingZeros(Mask); // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewSRLAmt); insertDAGNode(DAG, N, NewSRL); insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewAnd; return false; } bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); LLVM_DEBUG({ dbgs() << "MatchAddress: "; AM.dump(CurDAG); }); // Limit recursion. if (Depth > 5) return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. // RIP relative addressing: %rip + 32-bit displacement! if (AM.isRIPRelative()) { // FIXME: JumpTable and ExternalSymbol address currently don't like // displacements. It isn't very important, but this should be fixed for // consistency. if (!(AM.ES || AM.MCSym) && AM.JT != -1) return true; if (ConstantSDNode *Cst = dyn_cast(N)) if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } switch (N.getOpcode()) { default: break; case ISD::LOCAL_RECOVER: { if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) if (const auto *ESNode = dyn_cast(N.getOperand(0))) { // Use the symbol and don't prefix it. 
AM.MCSym = ESNode->getMCSymbol(); return false; } break; } case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: if (!matchLoadInAddress(cast(N), AM)) return false; break; case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; } break; case ISD::SHL: if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (ConstantSDNode *CN = dyn_cast(N.getOperand(1))) { unsigned Val = CN->getZExtValue(); // Note that we handle x<<1 as (,x,2) rather than (x,x) here so // that the base operand remains free for further matching. If // the base doesn't end up getting used, a post-processing step // in MatchAddress turns (,x,2) into (x,x), which is cheaper. if (Val == 1 || Val == 2 || Val == 3) { AM.Scale = 1 << Val; SDValue ShVal = N.getOperand(0); // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. if (CurDAG->isBaseWithConstantOffset(ShVal)) { AM.IndexReg = ShVal.getOperand(0); ConstantSDNode *AddVal = cast(ShVal.getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!foldOffsetIntoAddress(Disp, AM)) return false; } AM.IndexReg = ShVal; return false; } } break; case ISD::SRL: { // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. assert(N.getSimpleValueType().getSizeInBits() <= 64 && "Unexpected value size!"); SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; SDValue X = And.getOperand(0); // The mask used for the transform is expected to be post-shift, but we // found the shift first so just apply the shift to the mask before passing // it down. if (!isa(N.getOperand(1)) || !isa(And.getOperand(1))) break; uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); // Try to fold the mask and shift into the scale, and return false if we // succeed. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: // A mul_lohi where we need the low part can be folded as a plain multiply. if (N.getResNo() != 0) break; LLVM_FALLTHROUGH; case ISD::MUL: case X86ISD::MUL_IMM: // X*[3,5,9] -> X+X*[2,4,8] if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr) { if (ConstantSDNode *CN = dyn_cast(N.getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || CN->getZExtValue() == 9) { AM.Scale = unsigned(CN->getZExtValue())-1; SDValue MulVal = N.getOperand(0); SDValue Reg; // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. 
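          // e.g. (add %reg, 4) * 5 becomes Base = Index = %reg, Scale = 4,
          // Disp = 20, i.e. the address %reg + 4*%reg + 20.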
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && isa(MulVal.getOperand(1))) { Reg = MulVal.getOperand(0); ConstantSDNode *AddVal = cast(MulVal.getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getOperand(0); } else { Reg = N.getOperand(0); } AM.IndexReg = AM.Base_Reg = Reg; return false; } } break; case ISD::SUB: { // Given A-B, if A can be completely folded into the address and // the index field with the index field unused, use -B as the index. // This is a win if a has multiple parts that can be folded into // the address. Also, this saves a mov if the base register has // other uses, since it avoids a two-address sub instruction, however // it costs an additional mov if the index register has other uses. // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { N = Handle.getValue(); AM = Backup; break; } N = Handle.getValue(); // Test if the index field is free for use. if (AM.IndexReg.getNode() || AM.isRIPRelative()) { AM = Backup; break; } int Cost = 0; SDValue RHS = N.getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. if (!RHS.getNode()->hasOneUse() || RHS.getNode()->getOpcode() == ISD::CopyFromReg || RHS.getNode()->getOpcode() == ISD::TRUNCATE || RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && RHS.getOperand(0).getValueType() == MVT::i32)) ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && !AM.Base_Reg.getNode()->hasOneUse()) || AM.BaseType == X86ISelAddressMode::FrameIndexBase) --Cost; // If the folded LHS was interesting, this transformation saves // address arithmetic. if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + ((AM.Disp != 0) && (Backup.Disp == 0)) + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) --Cost; // If it doesn't look like it may be an overall win, don't do it. if (Cost >= 0) { AM = Backup; break; } // Ok, the transformation is legal and appears profitable. Go for it. // Negation will be emitted later to avoid creating dangling nodes if this // was an unprofitable LEA. AM.IndexReg = RHS; AM.NegateIndex = true; AM.Scale = 1; return false; } case ISD::ADD: if (!matchAdd(N, AM, Depth)) return false; break; case ISD::OR: // We want to look through a transform in InstCombine and DAGCombiner that // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) // An 'lea' can then be used to match the shift (multiply) and add: // and $1, %esi // lea (%rsi, %rdi, 8), %rax if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && !matchAdd(N, AM, Depth)) return false; break; case ISD::AND: { // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. 
assert(N.getSimpleValueType().getSizeInBits() <= 64 && "Unexpected value size!"); if (!isa(N.getOperand(1))) break; if (N.getOperand(0).getOpcode() == ISD::SRL) { SDValue Shift = N.getOperand(0); SDValue X = Shift.getOperand(0); uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift into BEXTR and scale. if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) return false; } // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) return false; break; } case ISD::ZERO_EXTEND: { // Try to widen a zexted shift left to the same size as its use, so we can // match the shift as a scale factor. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) break; // Give up if the shift is not a valid scale factor [1,2,3]. SDValue Shl = N.getOperand(0); auto *ShAmtC = dyn_cast(Shl.getOperand(1)); if (!ShAmtC || ShAmtC->getZExtValue() > 3) break; // The narrow shift must only shift out zero bits (it must be 'nuw'). // That makes it safe to widen to the destination type. APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), ShAmtC->getZExtValue()); if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) break; // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); // Convert the shift to scale factor. AM.Scale = 1 << ShAmtC->getZExtValue(); AM.IndexReg = Zext; insertDAGNode(*CurDAG, N, Zext); insertDAGNode(*CurDAG, N, NewShl); CurDAG->ReplaceAllUsesWith(N, NewShl); CurDAG->RemoveDeadNode(N.getNode()); return false; } } return matchAddressBase(N, AM); } /// Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. if (!AM.IndexReg.getNode()) { AM.IndexReg = N; AM.Scale = 1; return false; } // Otherwise, we cannot select it. return true; } // Default, generate it as a register. AM.BaseType = X86ISelAddressMode::RegBase; AM.Base_Reg = N; return false; } /// Helper for selectVectorAddr. Handles things that can be folded into a /// gather scatter address. The index register and scale should have already /// been handled. bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { // TODO: Support other operations. 
switch (N.getOpcode()) { case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: if (!matchWrapper(N, AM)) return false; break; } return matchAddressBase(N, AM); } bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, SDValue ScaleOp, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; AM.IndexReg = IndexOp; AM.Scale = cast(ScaleOp)->getZExtValue(); unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); SDLoc DL(BasePtr); MVT VT = BasePtr.getSimpleValueType(); // Try to match into the base and displacement fields. if (matchVectorAddress(BasePtr, AM)) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } /// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; if (Parent && // This list of opcodes are all the nodes that have an "addr:$ptr" operand // that are not a MemSDNode, and thus don't have proper addrspace info. Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores Parent->getOpcode() != X86ISD::TLSCALL && // Fixme Parent->getOpcode() != X86ISD::ENQCMD && // Fixme Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = cast(Parent)->getPointerInfo().getAddrSpace(); if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); } // Save the DL and VT before calling matchAddress, it can invalidate N. SDLoc DL(N); MVT VT = N.getSimpleValueType(); if (matchAddress(N, AM)) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { // In static codegen with small code model, we can get the address of a label // into a register with 'movl' if (N->getOpcode() != X86ISD::Wrapper) return false; N = N.getOperand(0); // At least GNU as does not accept 'movl' for TPOFF relocations. // FIXME: We could use 'movl' when we know we are targeting MC. 
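// --[ illustrative aside, not part of the patch ]------------------------------
// selectMOV64Imm32, which begins just above and continues below, accepts an
// address as a 32-bit 'movl' immediate either under the small code model or
// when an absolute-symbol range proves the value is below 2^32. A minimal
// standalone sketch of that range test; the parameter stands in for
// ConstantRange::getUnsignedMax():
#include <cassert>
#include <cstdint>

static bool fitsInMovl32Example(uint64_t UnsignedMax) {
  return UnsignedMax < (1ull << 32);
}

static void movlImmExample() {
  assert(fitsInMovl32Example(0xFFFFFFFFull));   // still reachable with movl
  assert(!fitsInMovl32Example(0x100000000ull)); // needs a 64-bit movabs
}
// -----------------------------------------------------------------------------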
if (N->getOpcode() == ISD::TargetGlobalTLSAddress) return false; Imm = N; if (N->getOpcode() != ISD::TargetGlobalAddress) return TM.getCodeModel() == CodeModel::Small; Optional CR = cast(N)->getGlobal()->getAbsoluteSymbolRange(); if (!CR) return TM.getCodeModel() == CodeModel::Small; return CR->getUnsignedMax().ult(1ull << 32); } bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { // Save the debug loc before calling selectLEAAddr, in case it invalidates N. SDLoc DL(N); if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; RegisterSDNode *RN = dyn_cast(Base); if (RN && RN->getReg() == 0) Base = CurDAG->getRegister(0, MVT::i64); else if (Base.getValueType() == MVT::i32 && !isa(Base)) { // Base could already be %rip, particularly in the x32 ABI. SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, MVT::i64), 0); Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, Base); } RN = dyn_cast(Index); if (RN && RN->getReg() == 0) Index = CurDAG->getRegister(0, MVT::i64); else { assert(Index.getValueType() == MVT::i32 && "Expect to be extending 32-bit registers for use in LEA"); SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, MVT::i64), 0); Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, Index); } return true; } /// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; // Save the DL and VT before calling matchAddress, it can invalidate N. SDLoc DL(N); MVT VT = N.getSimpleValueType(); // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support // segments. SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; unsigned Complexity = 0; if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) Complexity = 1; else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) Complexity = 4; if (AM.IndexReg.getNode()) Complexity++; // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with // a simple shift. if (AM.Scale > 1) Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { // For X86-64, always use LEA to materialize RIP-relative addresses. if (Subtarget->is64Bit()) Complexity = 4; else Complexity += 2; } // Heuristic: try harder to form an LEA from ADD if the operands set flags. // Unlike ADD, LEA does not affect flags, so we will be less likely to require // duplicating flag-producing instructions later in the pipeline. if (N.getOpcode() == ISD::ADD) { auto isMathWithFlags = [](SDValue V) { switch (V.getOpcode()) { case X86ISD::ADD: case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: /* TODO: These opcodes can be added safely, but we may want to justify their inclusion for different reasons (better for reg-alloc). 
case X86ISD::SMUL: case X86ISD::UMUL: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: */ // Value 1 is the flag output of the node - verify it's not dead. return !SDValue(V.getNode(), 1).use_empty(); default: return false; } }; // TODO: This could be an 'or' rather than 'and' to make the transform more // likely to happen. We might want to factor in whether there's a // load folding opportunity for the math op that disappears with LEA. if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) Complexity++; } if (AM.Disp) Complexity++; // If it isn't worth using an LEA, reject it. if (Complexity <= 2) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } /// This is only run on TargetGlobalTLSAddress nodes. bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); const GlobalAddressSDNode *GA = cast(N); X86ISelAddressMode AM; AM.GV = GA->getGlobal(); AM.Disp += GA->getOffset(); AM.SymbolFlags = GA->getTargetFlags(); if (Subtarget->is32Bit()) { AM.Scale = 1; AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); } MVT VT = N.getSimpleValueType(); getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); return true; } bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { // Keep track of the original value type and whether this value was // truncated. If we see a truncation from pointer type to VT that truncates // bits that are known to be zero, we can use a narrow reference. EVT VT = N.getValueType(); bool WasTruncated = false; if (N.getOpcode() == ISD::TRUNCATE) { WasTruncated = true; N = N.getOperand(0); } if (N.getOpcode() != X86ISD::Wrapper) return false; // We can only use non-GlobalValues as immediates if they were not truncated, // as we do not have any range information. If we have a GlobalValue and the // address was not truncated, we can select it as an operand directly. unsigned Opc = N.getOperand(0)->getOpcode(); if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { Op = N.getOperand(0); // We can only select the operand directly if we didn't have to look past a // truncate. return !WasTruncated; } // Check that the global's range fits into VT. auto *GA = cast(N.getOperand(0)); Optional CR = GA->getGlobal()->getAbsoluteSymbolRange(); if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) return false; // Okay, we can use a narrow reference. 
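// --[ illustrative aside, not part of the patch ]------------------------------
// selectLEAAddr above boils its profitability decision down to an integer
// "Complexity" score over the matched address components. A simplified
// standalone sketch of that scoring; the struct and weights only mirror the
// real heuristic approximately, and the flag-math bonus is omitted:
#include <cassert>

struct AddrModeExample {
  bool HasBaseReg, IsFrameIndexBase, HasIndexReg;
  unsigned Scale;
  bool HasSymbolicDisp, Is64Bit;
  long long Disp;
};

static bool worthUsingLEAExample(const AddrModeExample &AM) {
  unsigned Complexity = 0;
  if (AM.IsFrameIndexBase)
    Complexity = 4;
  else if (AM.HasBaseReg)
    Complexity = 1;
  if (AM.HasIndexReg)
    ++Complexity;
  if (AM.Scale > 1) // plain leal (,%reg,2) is cheaper as add %reg, %reg
    ++Complexity;
  if (AM.HasSymbolicDisp)
    Complexity = AM.Is64Bit ? 4 : Complexity + 2; // prefer RIP-relative LEA
  if (AM.Disp != 0)
    ++Complexity;
  return Complexity > 2; // mirrors "if (Complexity <= 2) return false;"
}

static void leaComplexityExample() {
  // base + index*4 + disp: enough components to be worth a three-address LEA.
  assert(worthUsingLEAExample({true, false, true, 4, false, true, 16}));
  // a bare register base: not worth an LEA.
  assert(!worthUsingLEAExample({true, false, false, 1, false, true, 0}));
}
// -----------------------------------------------------------------------------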
Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, GA->getOffset(), GA->getTargetFlags()); return true; } bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(Root && P && "Unknown root/parent nodes"); if (!ISD::isNON_EXTLoad(N.getNode()) || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) return false; return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(Root && P && "Unknown root/parent nodes"); if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) return false; return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); } bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { if (N->getOpcode() == ISD::TRUNCATE) N = N->getOperand(0).getNode(); if (N->getOpcode() != X86ISD::Wrapper) return false; auto *GA = dyn_cast(N->getOperand(0)); if (!GA) return false; Optional CR = GA->getGlobal()->getAbsoluteSymbolRange(); if (!CR) return Width == 32 && TM.getCodeModel() == CodeModel::Small; return CR->getSignedMin().sge(-1ull << Width) && CR->getSignedMax().slt(1ull << Width); } static X86::CondCode getCondFromNode(SDNode *N) { assert(N->isMachineOpcode() && "Unexpected node"); X86::CondCode CC = X86::COND_INVALID; unsigned Opc = N->getMachineOpcode(); if (Opc == X86::JCC_1) CC = static_cast(N->getConstantOperandVal(1)); else if (Opc == X86::SETCCr) CC = static_cast(N->getConstantOperandVal(0)); else if (Opc == X86::SETCCm) CC = static_cast(N->getConstantOperandVal(5)); else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || Opc == X86::CMOV64rr) CC = static_cast(N->getConstantOperandVal(2)); else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || Opc == X86::CMOV64rm) CC = static_cast(N->getConstantOperandVal(6)); return CC; } /// Test whether the given X86ISD::CMP node has any users that use a flag /// other than ZF. bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; // Only examine CopyToReg uses that copy to EFLAGS. if (UI->getOpcode() != ISD::CopyToReg || cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromNode(*FlagUI); switch (CC) { // Comparisons which only use the zero flag. 
case X86::COND_E: case X86::COND_NE: continue; // Anything else: assume conservatively. default: return false; } } } return true; } /// Test whether the given X86ISD::CMP node has any uses which require the SF /// flag to be accurate. bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; // Only examine CopyToReg uses that copy to EFLAGS. if (UI->getOpcode() != ISD::CopyToReg || cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromNode(*FlagUI); switch (CC) { // Comparisons which don't examine the SF flag. case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_NE: case X86::COND_O: case X86::COND_NO: case X86::COND_P: case X86::COND_NP: continue; // Anything else: assume conservatively. default: return false; } } } return true; } static bool mayUseCarryFlag(X86::CondCode CC) { switch (CC) { // Comparisons which don't examine the CF flag. case X86::COND_O: case X86::COND_NO: case X86::COND_E: case X86::COND_NE: case X86::COND_S: case X86::COND_NS: case X86::COND_P: case X86::COND_NP: case X86::COND_L: case X86::COND_GE: case X86::COND_G: case X86::COND_LE: return false; // Anything else: assume conservatively. default: return true; } } /// Test whether the given node which sets flags has any uses which require the /// CF flag to be accurate. bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; unsigned UIOpc = UI->getOpcode(); if (UIOpc == ISD::CopyToReg) { // Only examine CopyToReg uses that copy to EFLAGS. if (cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromNode(*FlagUI); if (mayUseCarryFlag(CC)) return false; } // This CopyToReg is ok. Move on to the next user. continue; } // This might be an unselected node. So look for the pre-isel opcodes that // use flags. unsigned CCOpNo; switch (UIOpc) { default: // Something unusual. Be conservative. return false; case X86ISD::SETCC: CCOpNo = 0; break; case X86ISD::SETCC_CARRY: CCOpNo = 0; break; case X86ISD::CMOV: CCOpNo = 2; break; case X86ISD::BRCOND: CCOpNo = 2; break; } X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); if (mayUseCarryFlag(CC)) return false; } return true; } /// Check whether or not the chain ending in StoreNode is suitable for doing /// the {load; op; store} to modify transformation. 
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, SDValue StoredVal, SelectionDAG *CurDAG, unsigned LoadOpNo, LoadSDNode *&LoadNode, SDValue &InputChain) { // Is the stored value result 0 of the operation? if (StoredVal.getResNo() != 0) return false; // Are there other uses of the operation other than the store? if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; // Is the store non-extending and non-indexed? if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) return false; SDValue Load = StoredVal->getOperand(LoadOpNo); // Is the stored value a non-extending and non-indexed load? if (!ISD::isNormalLoad(Load.getNode())) return false; // Return LoadNode by reference. LoadNode = cast(Load); // Is store the only read of the loaded value? if (!Load.hasOneUse()) return false; // Is the address of the store the same as the load? if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || LoadNode->getOffset() != StoreNode->getOffset()) return false; bool FoundLoad = false; SmallVector ChainOps; SmallVector LoopWorklist; SmallPtrSet Visited; const unsigned int Max = 1024; // Visualization of Load-Op-Store fusion: // ------------------------- // Legend: // *-lines = Chain operand dependencies. // |-lines = Normal operand dependencies. // Dependencies flow down and right. n-suffix references multiple nodes. // // C Xn C // * * * // * * * // Xn A-LD Yn TF Yn // * * \ | * | // * * \ | * | // * * \ | => A--LD_OP_ST // * * \| \ // TF OP \ // * | \ Zn // * | \ // A-ST Zn // // This merge induced dependences from: #1: Xn -> LD, OP, Zn // #2: Yn -> LD // #3: ST -> Zn // Ensure the transform is safe by checking for the dual // dependencies to make sure we do not induce a loop. // As LD is a predecessor to both OP and ST we can do this by checking: // a). if LD is a predecessor to a member of Xn or Yn. // b). if a Zn is a predecessor to ST. // However, (b) can only occur through being a chain predecessor to // ST, which is the same as Zn being a member or predecessor of Xn, // which is a subset of LD being a predecessor of Xn. So it's // subsumed by check (a). SDValue Chain = StoreNode->getChain(); // Gather X elements in ChainOps. if (Chain == Load.getValue(1)) { FoundLoad = true; ChainOps.push_back(Load.getOperand(0)); } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { FoundLoad = true; // Drop Load, but keep its chain. No cycle check necessary. ChainOps.push_back(Load.getOperand(0)); continue; } LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } } if (!FoundLoad) return false; // Worklist is currently Xn. Add Yn to worklist. for (SDValue Op : StoredVal->ops()) if (Op.getNode() != LoadNode) LoopWorklist.push_back(Op.getNode()); // Check (a) if Load is a predecessor to Xn + Yn if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, true)) return false; InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); return true; } // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its // address are suitable. // // The tablegen pattern memory operand pattern is currently not able to match // the case where the EFLAGS on the original operation are used. 
// // To move this to tablegen, we'll need to improve tablegen to allow flags to // be transferred from a node in the pattern to the result node, probably with // a new keyword. For example, we have this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (implicit EFLAGS)]>; // but maybe need something like this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (transferrable EFLAGS)]>; // // Until then, we manually fold these and instruction select the operation // here. bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { StoreSDNode *StoreNode = cast(Node); SDValue StoredVal = StoreNode->getOperand(1); unsigned Opc = StoredVal->getOpcode(); // Before we try to select anything, make sure this is memory operand size // and opcode we can handle. Note that this must match the code below that // actually lowers the opcodes. EVT MemVT = StoreNode->getMemoryVT(); if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && MemVT != MVT::i8) return false; bool IsCommutable = false; bool IsNegate = false; switch (Opc) { default: return false; case X86ISD::SUB: IsNegate = isNullConstant(StoredVal.getOperand(0)); break; case X86ISD::SBB: break; case X86ISD::ADD: case X86ISD::ADC: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: IsCommutable = true; break; } unsigned LoadOpNo = IsNegate ? 1 : 0; LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, LoadNode, InputChain)) { if (!IsCommutable) return false; // This operation is commutable, try the other operand. LoadOpNo = 1; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, LoadNode, InputChain)) return false; } SDValue Base, Scale, Index, Disp, Segment; if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, unsigned Opc8) { switch (MemVT.getSimpleVT().SimpleTy) { case MVT::i64: return Opc64; case MVT::i32: return Opc32; case MVT::i16: return Opc16; case MVT::i8: return Opc8; default: llvm_unreachable("Invalid size!"); } }; MachineSDNode *Result; switch (Opc) { case X86ISD::SUB: // Handle negate. if (IsNegate) { unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, X86::NEG8m); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); break; } LLVM_FALLTHROUGH; case X86ISD::ADD: // Try to match inc/dec. if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { unsigned NewOpc = ((Opc == X86ISD::ADD) == IsOne) ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); break; } } LLVM_FALLTHROUGH; case X86ISD::ADC: case X86ISD::SBB: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: { auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, X86::ADD8mr); case X86ISD::ADC: return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, X86::ADC8mr); case X86ISD::SUB: return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, X86::SUB8mr); case X86ISD::SBB: return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, X86::SBB8mr); case X86ISD::AND: return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, X86::AND8mr); case X86ISD::OR: return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); case X86ISD::XOR: return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, X86::XOR8mr); default: llvm_unreachable("Invalid opcode!"); } }; auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); case X86ISD::ADC: return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); case X86ISD::SBB: return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); case X86ISD::AND: return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); case X86ISD::OR: return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); case X86ISD::XOR: return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); default: llvm_unreachable("Invalid opcode!"); } }; auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, X86::ADD8mi); case X86ISD::ADC: return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, X86::ADC8mi); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, X86::SUB8mi); case X86ISD::SBB: return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, X86::SBB8mi); case X86ISD::AND: return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, X86::AND8mi); case X86ISD::OR: return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, X86::OR8mi); case X86ISD::XOR: return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, X86::XOR8mi); default: llvm_unreachable("Invalid opcode!"); } }; unsigned NewOpc = SelectRegOpcode(Opc); SDValue Operand = StoredVal->getOperand(1-LoadOpNo); // See if the operand is a constant that we can fold into an immediate // operand. if (auto *OperandC = dyn_cast(Operand)) { int64_t OperandV = OperandC->getSExtValue(); // Check if we can shrink the operand enough to fit in an immediate (or // fit into a smaller immediate) by negating it and switching the // operation. if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || (MemVT == MVT::i64 && !isInt<32>(OperandV) && isInt<32>(-OperandV))) && hasNoCarryFlagUses(StoredVal.getValue(1))) { OperandV = -OperandV; Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; } // First try to fit this into an Imm8 operand. If it doesn't fit, then try // the larger immediate operand. 
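// --[ illustrative aside, not part of the patch ]------------------------------
// The constant-folding step above may negate the immediate and flip ADD<->SUB
// so the value fits a sign-extended 8-bit (or 32-bit) encoding, provided
// nothing reads the instruction's carry flag. A standalone sketch of why the
// rewrite preserves the stored value:
#include <cassert>
#include <cstdint>

static bool fitsInImm8Example(int64_t V) { return V >= -128 && V <= 127; }

static void negateToShrinkImmExample() {
  int64_t OperandV = 128; // 'add [mem], 128' has no imm8 encoding...
  assert(!fitsInImm8Example(OperandV) && fitsInImm8Example(-OperandV));
  uint32_t Mem = 1000;
  uint32_t ViaAdd = Mem + (uint32_t)OperandV;
  uint32_t ViaSub = Mem - (uint32_t)(-OperandV); // ...but 'sub [mem], -128' does
  assert(ViaAdd == ViaSub);
}
// -----------------------------------------------------------------------------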
if (MemVT != MVT::i8 && isInt<8>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImm8Opcode(Opc); } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImmOpcode(Opc); } } if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { SDValue CopyTo = CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, StoredVal.getOperand(2), SDValue()); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Operand, CopyTo, CopyTo.getValue(1)}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); } else { const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Operand, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); } break; } default: llvm_unreachable("Invalid opcode!"); } MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), LoadNode->getMemOperand()}; CurDAG->setNodeMemRefs(Result, MemOps); // Update Load Chain uses as well. ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); CurDAG->RemoveDeadNode(Node); return true; } // See if this is an X & Mask that we can match to BEXTR/BZHI. // Where Mask is one of the following patterns: // a) x & (1 << nbits) - 1 // b) x & ~(-1 << nbits) // c) x & (-1 >> (32 - y)) // d) x << (32 - y) >> (32 - y) bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { assert( (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) && "Should be either an and-mask, or right-shift after clearing high bits."); // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) return false; MVT NVT = Node->getSimpleValueType(0); // Only supported for 32 and 64 bits. if (NVT != MVT::i32 && NVT != MVT::i64) return false; SDValue NBits; // If we have BMI2's BZHI, we are ok with muti-use patterns. // Else, if we only have BMI1's BEXTR, we require one-use. const bool CanHaveExtraUses = Subtarget->hasBMI2(); auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) { return CanHaveExtraUses || Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); }; auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { assert(V.getSimpleValueType() == MVT::i32 && V.getOperand(0).getSimpleValueType() == MVT::i64 && "Expected i64 -> i32 truncation"); V = V.getOperand(0); } return V; }; // a) x & ((1 << nbits) + (-1)) auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits](SDValue Mask) -> bool { // Match `add`. Must only have one use! if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) return false; // We should be adding all-ones constant (i.e. subtracting one.) if (!isAllOnesConstant(Mask->getOperand(1))) return false; // Match `1 << nbits`. Might be truncated. Must only have one use! 
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isOneConstant(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { V = peekThroughOneUseTruncation(V); return CurDAG->MaskedValueIsAllOnes( V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), NVT.getSizeInBits())); }; // b) x & ~(-1 << nbits) auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, &NBits](SDValue Mask) -> bool { // Match `~()`. Must only have one use! if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) return false; // The -1 only has to be all-ones for the final Node's NVT. if (!isAllOnes(Mask->getOperand(1))) return false; // Match `-1 << nbits`. Might be truncated. Must only have one use! SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; // The -1 only has to be all-ones for the final Node's NVT. if (!isAllOnes(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; // Match potentially-truncated (bitwidth - y) auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, unsigned Bitwidth) { // Skip over a truncate of the shift amount. if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { ShiftAmt = ShiftAmt.getOperand(0); // The trunc should have been the only user of the real shift amount. if (!checkOneUse(ShiftAmt)) return false; } // Match the shift amount as: (bitwidth - y). It should go away, too. if (ShiftAmt.getOpcode() != ISD::SUB) return false; auto *V0 = dyn_cast(ShiftAmt.getOperand(0)); if (!V0 || V0->getZExtValue() != Bitwidth) return false; NBits = ShiftAmt.getOperand(1); return true; }; // c) x & (-1 >> (32 - y)) auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, matchShiftAmt](SDValue Mask) -> bool { // The mask itself may be truncated. Mask = peekThroughOneUseTruncation(Mask); unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); // Match `l>>`. Must only have one use! if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) return false; // We should be shifting truly all-ones constant. if (!isAllOnesConstant(Mask.getOperand(0))) return false; SDValue M1 = Mask.getOperand(1); // The shift amount should not be used externally. if (!checkOneUse(M1)) return false; return matchShiftAmt(M1, Bitwidth); }; SDValue X; // d) x << (32 - y) >> (32 - y) auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt, &X](SDNode *Node) -> bool { if (Node->getOpcode() != ISD::SRL) return false; SDValue N0 = Node->getOperand(0); if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) return false; unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); SDValue N1 = Node->getOperand(1); SDValue N01 = N0->getOperand(1); // Both of the shifts must be by the exact same value. // There should not be any uses of the shift amount outside of the pattern. if (N1 != N01 || !checkTwoUse(N1)) return false; if (!matchShiftAmt(N1, Bitwidth)) return false; X = N0->getOperand(0); return true; }; auto matchLowBitMask = [matchPatternA, matchPatternB, matchPatternC](SDValue Mask) -> bool { return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); }; if (Node->getOpcode() == ISD::AND) { X = Node->getOperand(0); SDValue Mask = Node->getOperand(1); if (matchLowBitMask(Mask)) { // Great. 
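// --[ illustrative aside, not part of the patch ]------------------------------
// The four mask shapes matched above (patterns a-d) all keep the n lowest bits
// of x, which is what lets matchBitExtract canonicalize any of them to a
// single BEXTR/BZHI. A standalone check of the equivalence for 0 < n < 32:
#include <cassert>
#include <cstdint>

static void lowBitMaskFormsExample() {
  uint32_t x = 0xDEADBEEF;
  unsigned n = 13;
  uint32_t a = x & ((1u << n) - 1);         // a) x & ((1 << n) - 1)
  uint32_t b = x & ~(~0u << n);             // b) x & ~(-1 << n)
  uint32_t c = x & (~0u >> (32 - n));       // c) x & (-1 >> (32 - n))
  uint32_t d = (x << (32 - n)) >> (32 - n); // d) x << (32 - n) >> (32 - n)
  assert(a == b && b == c && c == d);
}
// -----------------------------------------------------------------------------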
} else { std::swap(X, Mask); if (!matchLowBitMask(Mask)) return false; } } else if (!matchPatternD(Node)) return false; SDLoc DL(Node); // Truncate the shift amount. NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); // Insert 8-bit NBits into lowest 8 bits of 32-bit register. // All the other bits are undefined, we do not care about them. SDValue ImplDef = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal); NBits = SDValue( CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef, NBits, SRIdxVal), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); if (Subtarget->hasBMI2()) { // Great, just emit the the BZHI.. if (NVT != MVT::i32) { // But have to place the bit count into the wide-enough register first. NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits); insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); } SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits); ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is // *logically* shifted (potentially with one-use trunc inbetween), // and the truncation was the only use of the shift, // and if so look past one-use truncation. { SDValue RealX = peekThroughOneUseTruncation(X); // FIXME: only if the shift is one-use? if (RealX != X && RealX.getOpcode() == ISD::SRL) X = RealX; } MVT XVT = X.getSimpleValueType(); // Else, emitting BEXTR requires one more step. // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location // [ bit count][ shift] name // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 // Shift NBits left by 8 bits, thus producing 'control'. // This makes the low 8 bits to be zero. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); insertDAGNode(*CurDAG, SDValue(Node, 0), C8); SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); insertDAGNode(*CurDAG, SDValue(Node, 0), Control); // If the 'X' is *logically* shifted, we can fold that shift into 'control'. // FIXME: only if the shift is one-use? if (X.getOpcode() == ISD::SRL) { SDValue ShiftAmt = X.getOperand(1); X = X.getOperand(0); assert(ShiftAmt.getValueType() == MVT::i8 && "Expected shift amount to be i8"); // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! // We could zext to i16 in some form, but we intentionally don't do that. SDValue OrigShiftAmt = ShiftAmt; ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); // And now 'or' these low 8 bits of shift amount into the 'control'. Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); insertDAGNode(*CurDAG, SDValue(Node, 0), Control); } // But have to place the 'control' into the wide-enough register first. if (XVT != MVT::i32) { Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); insertDAGNode(*CurDAG, SDValue(Node, 0), Control); } // And finally, form the BEXTR itself. SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control); // The 'X' was originally truncated. Do that now. 
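// --[ illustrative aside, not part of the patch ]------------------------------
// A software model of the BEXTR 'control' operand built above: bits [7:0] are
// the start position and bits [15:8] the number of bits to extract (start or
// length >= 32 edge cases are left out for brevity). The function names are
// made up for the example.
#include <cassert>
#include <cstdint>

static uint32_t bextrModelExample(uint32_t Src, unsigned Start, unsigned Len) {
  uint32_t Control = (Len << 8) | Start; // how the node builds 'control'
  unsigned S = Control & 0xFF, L = (Control >> 8) & 0xFF;
  return (Src >> S) & ((1u << L) - 1);
}

static void bextrControlExample() {
  // Control 0b00000011'00000001: extract 3 bits starting at bit 1.
  assert(bextrModelExample(0xF6u, 1, 3) == ((0xF6u >> 1) & 0x7u));
}
// -----------------------------------------------------------------------------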
if (XVT != NVT) { insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); } ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); SDLoc dl(Node); SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); // If we have TBM we can use an immediate for the control. If we have BMI // we should only do this if the BEXTR instruction is implemented well. // Otherwise moving the control into a register makes this more costly. // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? bool PreferBEXTR = Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); if (!PreferBEXTR && !Subtarget->hasBMI2()) return nullptr; // Must have a shift right. if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) return nullptr; // Shift can't have additional users. if (!N0->hasOneUse()) return nullptr; // Only supported for 32 and 64 bits. if (NVT != MVT::i32 && NVT != MVT::i64) return nullptr; // Shift amount and RHS of and must be constant. ConstantSDNode *MaskCst = dyn_cast(N1); ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); if (!MaskCst || !ShiftCst) return nullptr; // And RHS must be a mask. uint64_t Mask = MaskCst->getZExtValue(); if (!isMask_64(Mask)) return nullptr; uint64_t Shift = ShiftCst->getZExtValue(); uint64_t MaskSize = countPopulation(Mask); // Don't interfere with something that can be handled by extracting AH. // TODO: If we are able to fold a load, BEXTR might still be better than AH. if (Shift == 8 && MaskSize == 8) return nullptr; // Make sure we are only using bits that were in the original value, not // shifted in. if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; // BZHI, if available, is always fast, unlike BEXTR. But even if we decide // that we can't use BEXTR, it is only worthwhile using BZHI if the mask // does not fit into 32 bits. Load folding is not a sufficient reason. if (!PreferBEXTR && MaskSize <= 32) return nullptr; SDValue Control; unsigned ROpc, MOpc; if (!PreferBEXTR) { assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); // If we can't make use of BEXTR then we can't fuse shift+mask stages. // Let's perform the mask first, and apply shift later. Note that we need to // widen the mask to account for the fact that we'll apply shift afterwards! Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); } else { // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location // [ bit count][ shift] name // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); if (Subtarget->hasTBM()) { ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; } else { assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); // BMI requires the immediate to placed in a register. ROpc = NVT == MVT::i64 ? 
X86::BEXTR64rr : X86::BEXTR32rr; MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); } } MachineSDNode *NewNode; SDValue Input = N0->getOperand(0); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(Input.getValue(1), SDValue(NewNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Input)->getMemOperand()}); } else { NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); } if (!PreferBEXTR) { // We still need to apply the shift. SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; NewNode = CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); } return NewNode; } // Emit a PCMISTR(I/M) instruction. MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); SDValue Imm = Node->getOperand(2); const ConstantInt *Val = cast(Imm)->getConstantIntValue(); Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); // Try to fold a load. No need to check alignment. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, N1.getOperand(0) }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); return CNode; } SDValue Ops[] = { N0, N1, Imm }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); return CNode; } // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need // to emit a second instruction after this one. This is needed since we have two // copyToReg nodes glued before this and we need to continue that glue through. MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node, SDValue &InFlag) { SDValue N0 = Node->getOperand(0); SDValue N2 = Node->getOperand(2); SDValue Imm = Node->getOperand(4); const ConstantInt *Val = cast(Imm)->getConstantIntValue(); Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); // Try to fold a load. No need to check alignment. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, N2.getOperand(0), InFlag }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); InFlag = SDValue(CNode, 3); // Update the chain. 
ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N2)->getMemOperand()}); return CNode; } SDValue Ops[] = { N0, N2, Imm, InFlag }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); InFlag = SDValue(CNode, 2); return CNode; } bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { EVT VT = N->getValueType(0); // Only handle scalar shifts. if (VT.isVector()) return false; // Narrower shifts only mask to 5 bits in hardware. unsigned Size = VT == MVT::i64 ? 64 : 32; SDValue OrigShiftAmt = N->getOperand(1); SDValue ShiftAmt = OrigShiftAmt; SDLoc DL(N); // Skip over a truncate of the shift amount. if (ShiftAmt->getOpcode() == ISD::TRUNCATE) ShiftAmt = ShiftAmt->getOperand(0); // This function is called after X86DAGToDAGISel::matchBitExtract(), // so we are not afraid that we might mess up BZHI/BEXTR pattern. SDValue NewShiftAmt; if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) { SDValue Add0 = ShiftAmt->getOperand(0); SDValue Add1 = ShiftAmt->getOperand(1); // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X // to avoid the ADD/SUB. if (isa(Add1) && cast(Add1)->getZExtValue() % Size == 0) { NewShiftAmt = Add0; // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to // generate a NEG instead of a SUB of a constant. } else if (ShiftAmt->getOpcode() == ISD::SUB && isa(Add0) && cast(Add0)->getZExtValue() != 0 && cast(Add0)->getZExtValue() % Size == 0) { // Insert a negate op. // TODO: This isn't guaranteed to replace the sub if there is a logic cone // that uses it that's not a shift. EVT SubVT = ShiftAmt.getValueType(); SDValue Zero = CurDAG->getConstant(0, DL, SubVT); SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1); NewShiftAmt = Neg; // Insert these operands into a valid topological order so they can // get selected independently. insertDAGNode(*CurDAG, OrigShiftAmt, Zero); insertDAGNode(*CurDAG, OrigShiftAmt, Neg); } else return false; } else return false; if (NewShiftAmt.getValueType() != MVT::i8) { // Need to truncate the shift amount. NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); // Add to a correct topological ordering. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); } // Insert a new mask to keep the shift amount legal. This should be removed // by isel patterns. NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, CurDAG->getConstant(Size - 1, DL, MVT::i8)); // Place in a correct topological ordering. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewShiftAmt); if (UpdatedNode != N) { // If we found an existing node, we should replace ourselves with that node // and wait for it to be selected after its other users. ReplaceNode(N, UpdatedNode); return true; } // If the original shift amount is now dead, delete it so that we don't run // it through isel. if (OrigShiftAmt.getNode()->use_empty()) CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); // Now that we've optimized the shift amount, defer to normal isel to get // load folding and legacy vs BMI2 selection without repeating it here. 
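// --[ illustrative aside, not part of the patch ]------------------------------
// tryShiftAmountMod above relies on the hardware masking the shift count to
// its low 5 or 6 bits, so an amount of the form X + N (with N a multiple of
// the bit width) can drop the add, and N - X can become a plain negate.
// A standalone sketch for the 64-bit case:
#include <cassert>
#include <cstdint>

static uint64_t shlMaskedExample(uint64_t V, unsigned Amt) {
  return V << (Amt & 63); // what the hardware does with the count
}

static void shiftAmountModExample() {
  uint64_t V = 0x1234;
  unsigned X = 5;
  assert(shlMaskedExample(V, X + 64) == shlMaskedExample(V, X)); // drop the add
  assert(shlMaskedExample(V, 64 - X) ==
         shlMaskedExample(V, (unsigned)-(int)X)); // sub of a constant -> neg
}
// -----------------------------------------------------------------------------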
SelectCode(N); return true; } bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { MVT NVT = N->getSimpleValueType(0); unsigned Opcode = N->getOpcode(); SDLoc dl(N); // For operations of the form (x << C1) op C2, check if we can use a smaller // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. SDValue Shift = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *Cst = dyn_cast(N1); if (!Cst) return false; int64_t Val = Cst->getSExtValue(); // If we have an any_extend feeding the AND, look through it to see if there // is a shift behind it. But only if the AND doesn't use the extended bits. // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? bool FoundAnyExtend = false; if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && Shift.getOperand(0).getSimpleValueType() == MVT::i32 && isUInt<32>(Val)) { FoundAnyExtend = true; Shift = Shift.getOperand(0); } if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) return false; // i8 is unshrinkable, i16 should be promoted to i32. if (NVT != MVT::i32 && NVT != MVT::i64) return false; ConstantSDNode *ShlCst = dyn_cast(Shift.getOperand(1)); if (!ShlCst) return false; uint64_t ShAmt = ShlCst->getZExtValue(); // Make sure that we don't change the operation by removing bits. // This only matters for OR and XOR, AND is unaffected. uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) return false; // Check the minimum bitwidth for the new constant. // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { if (Opcode == ISD::AND) { // AND32ri is the same as AND64ri32 with zext imm. // Try this before sign extended immediates below. ShiftedVal = (uint64_t)Val >> ShAmt; if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) return true; // Also swap order when the AND can become MOVZX. if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) return true; } ShiftedVal = Val >> ShAmt; if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) || (!isInt<32>(Val) && isInt<32>(ShiftedVal))) return true; if (Opcode != ISD::AND) { // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr ShiftedVal = (uint64_t)Val >> ShAmt; if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) return true; } return false; }; int64_t ShiftedVal; if (!CanShrinkImmediate(ShiftedVal)) return false; // Ok, we can reorder to get a smaller immediate. // But, its possible the original immediate allowed an AND to become MOVZX. // Doing this late due to avoid the MakedValueIsZero call as late as // possible. if (Opcode == ISD::AND) { // Find the smallest zext this could possibly be. unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U)); // Figure out which bits need to be zero to achieve that mask. 
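// --[ illustrative aside, not part of the patch ]------------------------------
// tryShrinkShlLogicImm above uses the identity
//   (x << C1) op C2 == (x op (C2 >> C1)) << C1
// which holds for AND unconditionally and for OR/XOR when C2 has no bits set
// below bit C1; the reassociated constant is often small enough for a shorter
// encoding. A standalone sketch:
#include <cassert>
#include <cstdint>

static void shrinkShlLogicImmExample() {
  uint64_t x = 0x1234;
  unsigned C1 = 8;
  uint64_t C2 = 0x00FF0000;                // no bits below bit 8 are set
  uint64_t Before = (x << C1) | C2;        // needs the wide constant 0xFF0000
  uint64_t After = (x | (C2 >> C1)) << C1; // only needs 0xFF00
  assert(Before == After);
}
// -----------------------------------------------------------------------------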
APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), ZExtWidth); NeededMask &= ~Cst->getAPIntValue(); if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) return false; } SDValue X = Shift.getOperand(0); if (FoundAnyExtend) { SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); insertDAGNode(*CurDAG, SDValue(N, 0), NewX); X = NewX; } SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, Shift.getOperand(1)); ReplaceNode(N, NewSHL.getNode()); SelectCode(NewSHL.getNode()); return true; } bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC, SDValue A, SDValue B, SDValue C, uint8_t Imm) { assert(A.isOperandOf(ParentA)); assert(B.isOperandOf(ParentBC)); assert(C.isOperandOf(ParentBC)); auto tryFoldLoadOrBCast = [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) return true; // Not a load, check for broadcast which may be behind a bitcast. if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { P = L.getNode(); L = L.getOperand(0); } if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) return false; // Only 32 and 64 bit broadcasts are supported. auto *MemIntr = cast(L); unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); if (Size != 32 && Size != 64) return false; return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); }; bool FoldedLoad = false; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { FoldedLoad = true; } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { FoldedLoad = true; std::swap(A, C); // Swap bits 1/4 and 3/6. uint8_t OldImm = Imm; Imm = OldImm & 0xa5; if (OldImm & 0x02) Imm |= 0x10; if (OldImm & 0x10) Imm |= 0x02; if (OldImm & 0x08) Imm |= 0x40; if (OldImm & 0x40) Imm |= 0x08; } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { FoldedLoad = true; std::swap(B, C); // Swap bits 1/2 and 5/6. uint8_t OldImm = Imm; Imm = OldImm & 0x99; if (OldImm & 0x02) Imm |= 0x04; if (OldImm & 0x04) Imm |= 0x02; if (OldImm & 0x20) Imm |= 0x40; if (OldImm & 0x40) Imm |= 0x20; } SDLoc DL(Root); SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); MVT NVT = Root->getSimpleValueType(0); MachineSDNode *MNode; if (FoldedLoad) { SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); unsigned Opc; if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { auto *MemIntr = cast(C); unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); bool UseD = EltSize == 32; if (NVT.is128BitVector()) Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; else if (NVT.is256BitVector()) Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; else if (NVT.is512BitVector()) Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; else llvm_unreachable("Unexpected vector size!"); } else { bool UseD = NVT.getVectorElementType() == MVT::i32; if (NVT.is128BitVector()) Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; else if (NVT.is256BitVector()) Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; else if (NVT.is512BitVector()) Opc = UseD ? 
X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; else llvm_unreachable("Unexpected vector size!"); } SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); // Update the chain. ReplaceUses(C.getValue(1), SDValue(MNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(MNode, {cast(C)->getMemOperand()}); } else { bool UseD = NVT.getVectorElementType() == MVT::i32; unsigned Opc; if (NVT.is128BitVector()) Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; else if (NVT.is256BitVector()) Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; else if (NVT.is512BitVector()) Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; else llvm_unreachable("Unexpected vector size!"); MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); } ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0)); CurDAG->RemoveDeadNode(Root); return true; } // Try to match two logic ops to a VPTERNLOG. // FIXME: Handle inverted inputs? // FIXME: Handle more complex patterns that use an operand more than once? bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { MVT NVT = N->getSimpleValueType(0); // Make sure we support VPTERNLOG. if (!NVT.isVector() || !Subtarget->hasAVX512() || NVT.getVectorElementType() == MVT::i1) return false; // We need VLX for 128/256-bit. if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); auto getFoldableLogicOp = [](SDValue Op) { // Peek through single use bitcast. if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) Op = Op.getOperand(0); if (!Op.hasOneUse()) return SDValue(); unsigned Opc = Op.getOpcode(); if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || Opc == X86ISD::ANDNP) return Op; return SDValue(); }; SDValue A, FoldableOp; if ((FoldableOp = getFoldableLogicOp(N1))) { A = N0; } else if ((FoldableOp = getFoldableLogicOp(N0))) { A = N1; } else return false; SDValue B = FoldableOp.getOperand(0); SDValue C = FoldableOp.getOperand(1); // We can build the appropriate control immediate by performing the logic // operation we're matching using these constants for A, B, and C. const uint8_t TernlogMagicA = 0xf0; const uint8_t TernlogMagicB = 0xcc; const uint8_t TernlogMagicC = 0xaa; uint8_t Imm; switch (FoldableOp.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; } switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::ANDNP: if (A == N0) Imm &= ~TernlogMagicA; else Imm = ~(Imm) & TernlogMagicA; break; case ISD::AND: Imm &= TernlogMagicA; break; case ISD::OR: Imm |= TernlogMagicA; break; case ISD::XOR: Imm ^= TernlogMagicA; break; } return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm); } /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large /// positive one. This reverses a transform in SimplifyDemandedBits that /// shrinks mask constants by clearing bits. There is also a possibility that /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that /// case, just replace the 'and'. Return 'true' if the node is replaced. 
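// --[ illustrative aside, not part of the patch ]------------------------------
// The TernlogMagic constants above work because 0xF0, 0xCC and 0xAA are the
// eight-entry truth tables of the inputs A, B and C themselves, so applying
// the matched logic ops to the constants yields the VPTERNLOG immediate.
// A standalone check for the tree and(A, or(B, C)):
#include <cassert>
#include <cstdint>

static void ternlogImmExample() {
  const uint8_t MagicA = 0xF0, MagicB = 0xCC, MagicC = 0xAA;
  uint8_t Imm = (uint8_t)(MagicB | MagicC); // inner or(B, C)
  Imm &= MagicA;                            // outer and with A
  assert(Imm == 0xE0);
  // Cross-check: bit i of the immediate is a & (b | c) for a = bit2(i),
  // b = bit1(i), c = bit0(i).
  for (unsigned i = 0; i < 8; ++i) {
    unsigned a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
    assert(((Imm >> i) & 1) == (a & (b | c)));
  }
}
// -----------------------------------------------------------------------------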
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't // have immediate operands. MVT VT = And->getSimpleValueType(0); if (VT != MVT::i32 && VT != MVT::i64) return false; auto *And1C = dyn_cast(And->getOperand(1)); if (!And1C) return false; // Bail out if the mask constant is already negative. It's can't shrink more. // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel // patterns to use a 32-bit and instead of a 64-bit and by relying on the // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits // are negative too. APInt MaskVal = And1C->getAPIntValue(); unsigned MaskLZ = MaskVal.countLeadingZeros(); if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) return false; // Don't extend into the upper 32 bits of a 64 bit mask. if (VT == MVT::i64 && MaskLZ >= 32) { MaskLZ -= 32; MaskVal = MaskVal.trunc(32); } SDValue And0 = And->getOperand(0); APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); APInt NegMaskVal = MaskVal | HighZeros; // If a negative constant would not allow a smaller encoding, there's no need // to continue. Only change the constant when we know it's a win. unsigned MinWidth = NegMaskVal.getMinSignedBits(); if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) return false; // Extend masks if we truncated above. if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { NegMaskVal = NegMaskVal.zext(64); HighZeros = HighZeros.zext(64); } // The variable operand must be all zeros in the top bits to allow using the // new, negative constant as the mask. if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) return false; // Check if the mask is -1. In that case, this is an unnecessary instruction // that escaped earlier analysis. if (NegMaskVal.isAllOnesValue()) { ReplaceNode(And, And0.getNode()); return true; } // A negative mask allows a smaller encoding. Create a new 'and' node. SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); ReplaceNode(And, NewAnd.getNode()); SelectCode(NewAnd.getNode()); return true; } static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, bool FoldedBCast, bool Masked) { #define VPTESTM_CASE(VT, SUFFIX) \ case MVT::VT: \ if (Masked) \ return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; #define VPTESTM_BROADCAST_CASES(SUFFIX) \ default: llvm_unreachable("Unexpected VT!"); \ VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ VPTESTM_CASE(v16i32, DZ##SUFFIX) \ VPTESTM_CASE(v8i64, QZ##SUFFIX) #define VPTESTM_FULL_CASES(SUFFIX) \ VPTESTM_BROADCAST_CASES(SUFFIX) \ VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ VPTESTM_CASE(v64i8, BZ##SUFFIX) \ VPTESTM_CASE(v32i16, WZ##SUFFIX) if (FoldedBCast) { switch (TestVT.SimpleTy) { VPTESTM_BROADCAST_CASES(rmb) } } if (FoldedLoad) { switch (TestVT.SimpleTy) { VPTESTM_FULL_CASES(rm) } } switch (TestVT.SimpleTy) { VPTESTM_FULL_CASES(rr) } #undef VPTESTM_FULL_CASES #undef VPTESTM_BROADCAST_CASES #undef VPTESTM_CASE } // Try to create VPTESTM instruction. If InMask is not null, it will be used // to form a masked operation. 
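// --[ illustrative aside, not part of the patch ]------------------------------
// shrinkAndImmediate above widens a mask into its leading-zero bits when the
// other operand is already known to be zero there: the result is unchanged,
// but the new mask is negative and sign-extends from 8 bits. A standalone
// sketch of the equivalence:
#include <cassert>
#include <cstdint>

static void shrinkAndImmediateExample() {
  uint32_t x = 0x0000ABCD;                 // upper 16 bits known to be zero
  uint32_t SmallPositiveMask = 0x0000FFF0; // needs a 32-bit immediate
  uint32_t NegativeMask = 0xFFFFFFF0;      // same effect, encodes as imm8 -16
  assert((x & SmallPositiveMask) == (x & NegativeMask));
}
// -----------------------------------------------------------------------------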
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue InMask) { assert(Subtarget->hasAVX512() && "Expected AVX512!"); assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected VT!"); // Look for equal and not equal compares. ISD::CondCode CC = cast(Setcc.getOperand(2))->get(); if (CC != ISD::SETEQ && CC != ISD::SETNE) return false; SDValue SetccOp0 = Setcc.getOperand(0); SDValue SetccOp1 = Setcc.getOperand(1); // Canonicalize the all zero vector to the RHS. if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) std::swap(SetccOp0, SetccOp1); // See if we're comparing against zero. if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) return false; SDValue N0 = SetccOp0; MVT CmpVT = N0.getSimpleValueType(); MVT CmpSVT = CmpVT.getVectorElementType(); // Start with both operands the same. We'll try to refine this. SDValue Src0 = N0; SDValue Src1 = N0; { // Look through single use bitcasts. SDValue N0Temp = N0; if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) N0Temp = N0.getOperand(0); // Look for single use AND. if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { Src0 = N0Temp.getOperand(0); Src1 = N0Temp.getOperand(1); } } // Without VLX we need to widen the operation. bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { // If we need to widen, we can't fold the load. if (!Widen) if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) return true; // If we didn't fold a load, try to match broadcast. No widening limitation // for this. But only 32 and 64 bit types are supported. if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) return false; // Look through single use bitcasts. if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { P = L.getNode(); L = L.getOperand(0); } if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) return false; auto *MemIntr = cast(L); if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) return false; return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); }; // We can only fold loads if the sources are unique. bool CanFoldLoads = Src0 != Src1; bool FoldedLoad = false; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (CanFoldLoads) { FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (!FoldedLoad) { // And is commutative. FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedLoad) std::swap(Src0, Src1); } } bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; bool IsMasked = InMask.getNode() != nullptr; SDLoc dl(Root); MVT ResVT = Setcc.getSimpleValueType(); MVT MaskVT = ResVT; if (Widen) { // Widen the inputs using insert_subreg or copy_to_regclass. unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; unsigned NumElts = CmpVT.getVectorNumElements() * Scale; CmpVT = MVT::getVectorVT(CmpSVT, NumElts); MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, CmpVT), 0); Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); if (!FoldedBCast) Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); if (IsMasked) { // Widen the mask. 
unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, MaskVT, InMask, RC), 0); } } bool IsTestN = CC == ISD::SETEQ; unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, IsMasked); MachineSDNode *CNode; if (FoldedLoad) { SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); if (IsMasked) { SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } else { SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } // Update the chain. ReplaceUses(Src1.getValue(1), SDValue(CNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(Src1)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); else CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); } // If we widened, we need to shrink the mask VT. if (Widen) { unsigned RegClass = TLI->getRegClassFor(ResVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, ResVT, SDValue(CNode, 0), RC); } ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); CurDAG->RemoveDeadNode(Root); return true; } // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it // into vpternlog. bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); MVT NVT = N->getSimpleValueType(0); // Make sure we support VPTERNLOG. if (!NVT.isVector() || !Subtarget->hasAVX512()) return false; // We need VLX for 128/256-bit. if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Canonicalize AND to LHS. if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP || !N0.hasOneUse() || !N1.hasOneUse()) return false; // ANDN is not commutable, use it to pick down A and C. SDValue A = N1.getOperand(0); SDValue C = N1.getOperand(1); // AND is commutable, if one operand matches A, the other operand is B. // Otherwise this isn't a match. SDValue B; if (N0.getOperand(0) == A) B = N0.getOperand(1); else if (N0.getOperand(1) == A) B = N0.getOperand(0); else return false; SDLoc dl(N); SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); ReplaceNode(N, Ternlog.getNode()); return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(), A, B, C, 0xCA); } void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); SDLoc dl(Node); if (Node->isMachineOpcode()) { LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); Node->setNodeId(-1); return; // Already selected. 
} switch (Opcode) { default: break; case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { default: break; case Intrinsic::x86_encodekey128: case Intrinsic::x86_encodekey256: { if (!Subtarget->hasKL()) break; unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break; case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break; } SDValue Chain = Node->getOperand(0); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3), SDValue()); if (Opcode == X86::ENCODEKEY256) Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4), Chain.getValue(1)); MachineSDNode *Res = CurDAG->getMachineNode( Opcode, dl, Node->getVTList(), {Node->getOperand(2), Chain, Chain.getValue(1)}); ReplaceNode(Node, Res); return; } case Intrinsic::x86_tileloadd64_internal: { if (!Subtarget->hasAMXTILE()) break; unsigned Opc = X86::PTILELOADDV; // _tile_loadd_internal(row, col, buf, STRIDE) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Base, Scale, Index, Disp, Segment, + CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); return; } case Intrinsic::x86_tdpbssd_internal: { if (!Subtarget->hasAMXTILE()) break; SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), + CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); return; } case Intrinsic::x86_tilezero_internal: { if (!Subtarget->hasAMXTILE()) break; unsigned Opc = X86::PTILEZEROV; SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain}; + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); return; } } break; } case ISD::INTRINSIC_VOID: { unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { default: break; case Intrinsic::x86_sse3_monitor: case Intrinsic::x86_monitorx: case Intrinsic::x86_clzero: { bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; unsigned Opc = 0; switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse3_monitor: if (!Subtarget->hasSSE3()) break; Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; break; case Intrinsic::x86_monitorx: if (!Subtarget->hasMWAITX()) break; Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; break; case Intrinsic::x86_clzero: if (!Subtarget->hasCLZERO()) break; Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; break; } if (Opc) { unsigned PtrReg = Use64BitPtr ? 
X86::RAX : X86::EAX; SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, Node->getOperand(2), SDValue()); SDValue InFlag = Chain.getValue(1); if (IntNo == Intrinsic::x86_sse3_monitor || IntNo == Intrinsic::x86_monitorx) { // Copy the other two operands to ECX and EDX. Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), InFlag); InFlag = Chain.getValue(1); Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), InFlag); InFlag = Chain.getValue(1); } MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, { Chain, InFlag}); ReplaceNode(Node, CNode); return; } break; } case Intrinsic::x86_tilestored64_internal: { unsigned Opc = X86::PTILESTOREDV; // _tile_stored_internal(row, col, buf, STRIDE, c) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Base, Scale, Index, Disp, Segment, Node->getOperand(6), + CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); return; } case Intrinsic::x86_tileloadd64: case Intrinsic::x86_tileloaddt164: case Intrinsic::x86_tilestored64: { if (!Subtarget->hasAMXTILE()) break; unsigned Opc; switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; } // FIXME: Match displacement and scale. unsigned TIndex = Node->getConstantOperandVal(2); SDValue TReg = getI8Imm(TIndex, dl); SDValue Base = Node->getOperand(3); SDValue Scale = getI8Imm(1, dl); SDValue Index = Node->getOperand(4); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; if (Opc == X86::PTILESTORED) { SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); } else { SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); } ReplaceNode(Node, CNode); return; } } break; } case ISD::BRIND: { if (Subtarget->isTargetNaCl()) // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We // leave the instruction alone. break; if (Subtarget->isTarget64BitILP32()) { // Converts a 32-bit register to a 64-bit, zero-extended version of // it. This is needed because x86-64 can do many things, but jmp %r32 // ain't one of them. SDValue Target = Node->getOperand(1); assert(Target.getValueType() == MVT::i32 && "Unexpected VT!"); SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, Node->getOperand(0), ZextTarget); ReplaceNode(Node, Brind.getNode()); SelectCode(ZextTarget.getNode()); SelectCode(Brind.getNode()); return; } break; } case X86ISD::GlobalBaseReg: ReplaceNode(Node, getGlobalBaseReg()); return; case ISD::BITCAST: // Just drop all 128/256/512-bit bitcasts. 
if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || NVT == MVT::f128) { ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); CurDAG->RemoveDeadNode(Node); return; } break; case ISD::SRL: if (matchBitExtract(Node)) return; LLVM_FALLTHROUGH; case ISD::SRA: case ISD::SHL: if (tryShiftAmountMod(Node)) return; break; case X86ISD::VPTERNLOG: { uint8_t Imm = cast(Node->getOperand(3))->getZExtValue(); if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), Imm)) return; break; } case X86ISD::ANDNP: if (tryVPTERNLOG(Node)) return; break; case ISD::AND: if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { // Try to form a masked VPTESTM. Operands can be in either order. SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && tryVPTESTM(Node, N0, N1)) return; if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && tryVPTESTM(Node, N1, N0)) return; } if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); return; } if (matchBitExtract(Node)) return; if (AndImmShrink && shrinkAndImmediate(Node)) return; LLVM_FALLTHROUGH; case ISD::OR: case ISD::XOR: if (tryShrinkShlLogicImm(Node)) return; if (Opcode == ISD::OR && tryMatchBitSelect(Node)) return; if (tryVPTERNLOG(Node)) return; LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: { // Try to avoid folding immediates with multiple uses for optsize. // This code tries to select to register form directly to avoid going // through the isel table which might fold the immediate. We can't change // the patterns on the add/sub/and/or/xor with immediate paterns in the // tablegen files to check immediate use count without making the patterns // unavailable to the fast-isel table. if (!CurDAG->shouldOptForSize()) break; // Only handle i8/i16/i32/i64. if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) break; SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); ConstantSDNode *Cst = dyn_cast(N1); if (!Cst) break; int64_t Val = Cst->getSExtValue(); // Make sure its an immediate that is considered foldable. // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. if (!isInt<8>(Val) && !isInt<32>(Val)) break; // If this can match to INC/DEC, let it go. if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) break; // Check if we should avoid folding this immediate. if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) break; // We should not fold the immediate. So we need a register form instead. 
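// --- Illustrative sketch, not part of the patch above ----------------------
// Rough numbers behind the optsize heuristic: an imm32 that feeds several
// logic/arithmetic ops is cheaper to materialize once.  Generic r32
// encodings (exact sizes vary by register and opcode):
//
//   andl $0x12345678, %ebx   # 6 bytes        movl $0x12345678, %ecx  # 5 bytes
//   andl $0x12345678, %esi   # 6 bytes   vs   andl %ecx, %ebx         # 2 bytes
//   andl $0x12345678, %edi   # 6 bytes        andl %ecx, %esi         # 2 bytes
//                                             andl %ecx, %edi         # 2 bytes
//   total: 18 bytes                           total: 11 bytes
//
// Selecting the register form here keeps the shared constant out of the
// instruction encodings; the isel table would otherwise happily fold it.
// ----------------------------------------------------------------------------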
unsigned ROpc, MOpc; switch (NVT.SimpleTy) { default: llvm_unreachable("Unexpected VT!"); case MVT::i8: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; } break; case MVT::i16: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; } break; case MVT::i32: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; } break; case MVT::i64: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; } break; } // Ok this is a AND/OR/XOR/ADD/SUB with constant. // If this is a not a subtract, we can still try to fold a load. if (Opcode != ISD::SUB) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N0)->getMemOperand()}); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); CurDAG->RemoveDeadNode(Node); return; } } CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); return; } case X86ISD::SMUL: // i16/i32/i64 are handled with isel patterns. if (NVT != MVT::i8) break; LLVM_FALLTHROUGH; case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned LoReg, ROpc, MOpc; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; break; case MVT::i16: LoReg = X86::AX; ROpc = X86::MUL16r; MOpc = X86::MUL16m; break; case MVT::i32: LoReg = X86::EAX; ROpc = X86::MUL32r; MOpc = X86::MUL32m; break; case MVT::i64: LoReg = X86::RAX; ROpc = X86::MUL64r; MOpc = X86::MUL64m; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commutative. 
if (!FoldedLoad) { FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedLoad) std::swap(N0, N1); } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); MachineSDNode *CNode; if (FoldedLoad) { // i16/i32/i64 use an instruction that produces a low and high result even // though only the low result is used. SDVTList VTs; if (NVT == MVT::i8) VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); else VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { // i16/i32/i64 use an instruction that produces a low and high result even // though only the low result is used. SDVTList VTs; if (NVT == MVT::i8) VTs = CurDAG->getVTList(NVT, MVT::i32); else VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); } ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); CurDAG->RemoveDeadNode(Node); return; } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned Opc, MOpc; unsigned LoReg, HiReg; bool IsSigned = Opcode == ISD::SMUL_LOHI; bool UseMULX = !IsSigned && Subtarget->hasBMI2(); bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: Opc = UseMULXHi ? X86::MULX32Hrr : UseMULX ? X86::MULX32rr : IsSigned ? X86::IMUL32r : X86::MUL32r; MOpc = UseMULXHi ? X86::MULX32Hrm : UseMULX ? X86::MULX32rm : IsSigned ? X86::IMUL32m : X86::MUL32m; LoReg = UseMULX ? X86::EDX : X86::EAX; HiReg = X86::EDX; break; case MVT::i64: Opc = UseMULXHi ? X86::MULX64Hrr : UseMULX ? X86::MULX64rr : IsSigned ? X86::IMUL64r : X86::MUL64r; MOpc = UseMULXHi ? X86::MULX64Hrm : UseMULX ? X86::MULX64rm : IsSigned ? X86::IMUL64m : X86::MUL64m; LoReg = UseMULX ? X86::RDX : X86::RAX; HiReg = X86::RDX; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commmutative. if (!foldedLoad) { foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); SDValue ResHi, ResLo; if (foldedLoad) { SDValue Chain; MachineSDNode *CNode = nullptr; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; if (UseMULXHi) { SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); Chain = SDValue(CNode, 1); } else if (UseMULX) { SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); ResLo = SDValue(CNode, 1); Chain = SDValue(CNode, 2); } else { SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); Chain = SDValue(CNode, 0); InFlag = SDValue(CNode, 1); } // Update the chain. 
ReplaceUses(N1.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { SDValue Ops[] = { N1, InFlag }; if (UseMULXHi) { SDVTList VTs = CurDAG->getVTList(NVT); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); } else if (UseMULX) { SDVTList VTs = CurDAG->getVTList(NVT, NVT); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); ResLo = SDValue(CNode, 1); } else { SDVTList VTs = CurDAG->getVTList(MVT::Glue); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); InFlag = SDValue(CNode, 0); } } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { if (!ResLo) { assert(LoReg && "Register for low half is not defined!"); ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); InFlag = ResLo.getValue(2); } ReplaceUses(SDValue(Node, 0), ResLo); LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { if (!ResHi) { assert(HiReg && "Register for high half is not defined!"); ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); InFlag = ResHi.getValue(2); } ReplaceUses(SDValue(Node, 1), ResHi); LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } case ISD::SDIVREM: case ISD::UDIVREM: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned ROpc, MOpc; bool isSigned = Opcode == ISD::SDIVREM; if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; } } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; } } unsigned LoReg, HiReg, ClrReg; unsigned SExtOpcode; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; SExtOpcode = 0; // Not used. break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; ClrReg = X86::DX; SExtOpcode = X86::CWD; break; case MVT::i32: LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; SExtOpcode = X86::CDQ; break; case MVT::i64: LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; SExtOpcode = X86::CQO; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; if (NVT == MVT::i8) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; MachineSDNode *Move; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; unsigned Opc = (isSigned && !signBitIsZero) ? 
X86::MOVSX16rm8 : X86::MOVZX16rm8; Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); Chain = SDValue(Move, 1); ReplaceUses(N0.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(Move, {cast(N0)->getMemOperand()}); } else { unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 : X86::MOVZX16rr8; Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); Chain = CurDAG->getEntryNode(); } Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), SDValue()); InFlag = Chain.getValue(1); } else { InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); if (isSigned && !signBitIsZero) { // Sign extend the low part into the high part. InFlag = SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, CurDAG->getTargetConstant(X86::sub_16bit, dl, MVT::i32)), 0); break; case MVT::i32: break; case MVT::i64: ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 0); break; default: llvm_unreachable("Unexpected division source"); } InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, ClrNode, InFlag).getValue(1); } } if (foldedLoad) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { InFlag = SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by explicitly copying it to // an ABCD_L register. // // The current assumption of the register allocator is that isel // won't generate explicit references to the GR8_ABCD_H registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); unsigned AHExtOpcode = isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, MVT::Glue, AHCopy, InFlag); SDValue Result(RNode, 0); InFlag = SDValue(RNode, 1); Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); ReplaceUses(SDValue(Node, 1), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the remainder (high) result, if it is needed. 
if (!SDValue(Node, 1).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 1), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } case X86ISD::FCMP: case X86ISD::STRICT_FCMP: case X86ISD::STRICT_FCMPS: { bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || Node->getOpcode() == X86ISD::STRICT_FCMPS; SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0); SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1); // Save the original VT of the compare. MVT CmpVT = N0.getSimpleValueType(); // Floating point needs special handling if we don't have FCOMI. if (Subtarget->hasCMov()) break; bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; unsigned Opc; switch (CmpVT.SimpleTy) { default: llvm_unreachable("Unexpected type!"); case MVT::f32: Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; break; case MVT::f64: Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; break; case MVT::f80: Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; break; } SDValue Cmp; SDValue Chain = IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode(); if (IsStrictCmp) { SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other); Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0); Chain = Cmp.getValue(1); } else { Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0); } // Move FPSW to AX. SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue()); Chain = FPSW; SDValue FNSTSW = SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW, FPSW.getValue(1)), 0); // Extract upper 8-bits of AX. SDValue Extract = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW); // Move AH into flags. // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); Chain = AH; SDValue SAHF = SDValue( CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0); if (IsStrictCmp) ReplaceUses(SDValue(Node, 1), Chain); ReplaceUses(SDValue(Node, 0), SAHF); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::CMP: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); // Optimizations for TEST compares. if (!isNullConstant(N1)) break; // Save the original VT of the compare. MVT CmpVT = N0.getSimpleValueType(); // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed // by a test instruction. The test should be removed later by // analyzeCompare if we are using only the zero flag. // TODO: Should we check the users and use the BEXTR flags directly? if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) { unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr : X86::TEST32rr; SDValue BEXTR = SDValue(NewNode, 0); NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); return; } } // We can peek through truncates, but we need to be careful below. if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. 
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8) { ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C) break; uint64_t Mask = C->getZExtValue(); // Check if we can replace AND+IMM64 with a shift. This is possible for // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero // flag. if (CmpVT == MVT::i64 && !isInt<32>(Mask) && onlyUsesZeroFlag(SDValue(Node, 0))) { if (isMask_64(~Mask)) { unsigned TrailingZeros = countTrailingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } if (isMask_64(Mask)) { unsigned LeadingZeros = countLeadingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } } MVT VT; int SubRegOp; unsigned ROpc, MOpc; // For each of these checks we need to be careful if the sign flag is // being used. It is only safe to use the sign flag in two conditions, // either the sign bit in the shrunken mask is zero or the final test // size is equal to the original compare size. if (isUInt<8>(Mask) && (!(Mask & 0x80) || CmpVT == MVT::i8 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, convert "testl %eax, $8" to "testb %al, $8" VT = MVT::i8; SubRegOp = X86::sub_8bit; ROpc = X86::TEST8ri; MOpc = X86::TEST8mi; } else if (OptForMinSize && isUInt<16>(Mask) && (!(Mask & 0x8000) || CmpVT == MVT::i16 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testl %eax, $32776" to "testw %ax, $32776". // NOTE: We only want to form TESTW instructions if optimizing for // min size. Otherwise we only save one byte and possibly get a length // changing prefix penalty in the decoders. VT = MVT::i16; SubRegOp = X86::sub_16bit; ROpc = X86::TEST16ri; MOpc = X86::TEST16mi; } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && ((!(Mask & 0x80000000) && // Without minsize 16-bit Cmps can get here so we need to // be sure we calculate the correct sign flag if needed. (CmpVT != MVT::i16 || !(Mask & 0x8000))) || CmpVT == MVT::i32 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testq %rax, $268468232" to "testl %eax, $268468232". // NOTE: We only want to run that transform if N0 is 32 or 64 bits. // Otherwize, we find ourselves in a position where we have to do // promotion. If previous passes did not promote the and, we assume // they had a good reason not to and do not promote here. VT = MVT::i32; SubRegOp = X86::sub_32bit; ROpc = X86::TEST32ri; MOpc = X86::TEST32mi; } else { // No eligible transformation was found. break; } SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); SDValue Reg = N0.getOperand(0); // Emit a testl or testw. 
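// --- Illustrative sketch, not part of the patch above ----------------------
// The AND+imm64 -> shift rewrite above relies on a zero-flag-only identity:
// when the mask is one contiguous run of high (or low) bits, "is any masked
// bit set" is the same question as "is the value non-zero after shifting the
// unmasked bits out", and the shift needs no 64-bit immediate at all.
// (checkMaskToShift is a hypothetical name, not an LLVM API.)
#include <cassert>
#include <cstdint>

static void checkMaskToShift(uint64_t X) {
  const uint64_t HighRun = ~0ull << 12;     // isMask_64(~HighRun); 12 trailing zeros.
  assert(((X & HighRun) == 0) == ((X >> 12) == 0));
  const uint64_t LowRun = (1ull << 36) - 1; // isMask_64(LowRun); 28 leading zeros.
  assert(((X & LowRun) == 0) == ((X << 28) == 0));
}
// ----------------------------------------------------------------------------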
MachineSDNode *NewNode; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { if (auto *LoadN = dyn_cast(N0.getOperand(0).getNode())) { if (!LoadN->isSimple()) { unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits(); if (MOpc == X86::TEST8mi && NumVolBits != 8) break; else if (MOpc == X86::TEST16mi && NumVolBits != 16) break; else if (MOpc == X86::TEST32mi && NumVolBits != 32) break; } } SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, Reg.getOperand(0) }; NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops); // Update the chain. ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Reg)->getMemOperand()}); } else { // Extract the subregister if necessary. if (N0.getValueType() != VT) Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm); } // Replace CMP with TEST. ReplaceNode(Node, NewNode); return; } break; } case X86ISD::PCMPISTR: { if (!Subtarget->hasSSE42()) break; bool NeedIndex = !SDValue(Node, 0).use_empty(); bool NeedMask = !SDValue(Node, 1).use_empty(); // We can't fold a load if we are going to make two instructions. bool MayFoldLoad = !NeedIndex || !NeedMask; MachineSDNode *CNode; if (NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); } if (NeedIndex || !NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); } // Connect the flag usage to the last instruction created. ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::PCMPESTR: { if (!Subtarget->hasSSE42()) break; // Copy the two implicit register inputs. SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, Node->getOperand(1), SDValue()).getValue(1); InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, Node->getOperand(3), InFlag).getValue(1); bool NeedIndex = !SDValue(Node, 0).use_empty(); bool NeedMask = !SDValue(Node, 1).use_empty(); // We can't fold a load if we are going to make two instructions. bool MayFoldLoad = !NeedIndex || !NeedMask; MachineSDNode *CNode; if (NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InFlag); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); } if (NeedIndex || !NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); } // Connect the flag usage to the last instruction created. 
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case ISD::SETCC: { if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) return; break; } case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; break; case X86ISD::SETCC_CARRY: { // We have to do this manually because tblgen will put the eflags copy in // the wrong place if we use an extract_subreg in the pattern. MVT VT = Node->getSimpleValueType(0); // Copy flags to the EFLAGS register and glue it to next node. SDValue EFLAGS = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, Node->getOperand(1), SDValue()); // Create a 64-bit instruction if the result is 64-bits otherwise use the // 32-bit version. unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; SDValue Result = SDValue( CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); // For less than 32-bits we need to extract from the 32-bit node. if (VT == MVT::i8 || VT == MVT::i16) { int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); } ReplaceUses(SDValue(Node, 0), Result); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::SBB: { if (isNullConstant(Node->getOperand(0)) && isNullConstant(Node->getOperand(1))) { MVT VT = Node->getSimpleValueType(0); // Create zero. SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); SDValue Zero = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); if (VT == MVT::i64) { Zero = SDValue( CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 0); } // Copy flags to the EFLAGS register and glue it to next node. SDValue EFLAGS = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, Node->getOperand(2), SDValue()); // Create a 64-bit instruction if the result is 64-bits otherwise use the // 32-bit version. unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; VTs = CurDAG->getVTList(SBBVT, MVT::i32); SDValue Result = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}), 0); // Replace the flag use. ReplaceUses(SDValue(Node, 1), Result.getValue(1)); // Replace the result use. if (!SDValue(Node, 0).use_empty()) { // For less than 32-bits we need to extract from the 32-bit node. if (VT == MVT::i8 || VT == MVT::i16) { int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); } ReplaceUses(SDValue(Node, 0), Result); } CurDAG->RemoveDeadNode(Node); return; } break; } case X86ISD::MGATHER: { auto *Mgt = cast(Node); SDValue IndexOp = Mgt->getIndex(); SDValue Mask = Mgt->getMask(); MVT IndexVT = IndexOp.getSimpleValueType(); MVT ValueVT = Node->getSimpleValueType(0); MVT MaskVT = Mask.getSimpleValueType(); // This is just to prevent crashes if the nodes are malformed somehow. We're // otherwise only doing loose type checking in here based on type what // a type constraint would say just like table based isel. 
if (!ValueVT.isVector() || !MaskVT.isVector()) break; unsigned NumElts = ValueVT.getVectorNumElements(); MVT ValueSVT = ValueVT.getVectorElementType(); bool IsFP = ValueSVT.isFloatingPoint(); unsigned EltSize = ValueSVT.getSizeInBits(); unsigned Opc = 0; bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; if (AVX512Gather) { if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; } else { assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && "Unexpected mask VT!"); if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; } if (!Opc) break; SDValue Base, Scale, Index, Disp, Segment; if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(), Base, Scale, Index, Disp, Segment)) break; SDValue PassThru = Mgt->getPassThru(); SDValue Chain = Mgt->getChain(); // Gather instructions have a mask output not in the ISD node. 
SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other); MachineSDNode *NewNode; if (AVX512Gather) { SDValue Ops[] = {PassThru, Mask, Base, Scale, Index, Disp, Segment, Chain}; NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); } else { SDValue Ops[] = {PassThru, Base, Scale, Index, Disp, Segment, Mask, Chain}; NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); } CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()}); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2)); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::MSCATTER: { auto *Sc = cast(Node); SDValue Value = Sc->getValue(); SDValue IndexOp = Sc->getIndex(); MVT IndexVT = IndexOp.getSimpleValueType(); MVT ValueVT = Value.getSimpleValueType(); // This is just to prevent crashes if the nodes are malformed somehow. We're // otherwise only doing loose type checking in here based on type what // a type constraint would say just like table based isel. if (!ValueVT.isVector()) break; unsigned NumElts = ValueVT.getVectorNumElements(); MVT ValueSVT = ValueVT.getVectorElementType(); bool IsFP = ValueSVT.isFloatingPoint(); unsigned EltSize = ValueSVT.getSizeInBits(); unsigned Opc; if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; else break; SDValue Base, Scale, Index, Disp, Segment; if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(), Base, Scale, Index, Disp, Segment)) break; SDValue Mask = Sc->getMask(); SDValue Chain = Sc->getChain(); // Scatter instructions have a mask output not in the ISD node. 
SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()}); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case ISD::PREALLOCATED_SETUP: { auto *MFI = CurDAG->getMachineFunction().getInfo(); auto CallId = MFI->getPreallocatedIdForCallSite( cast(Node->getOperand(1))->getValue()); SDValue Chain = Node->getOperand(0); SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); MachineSDNode *New = CurDAG->getMachineNode( TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain CurDAG->RemoveDeadNode(Node); return; } case ISD::PREALLOCATED_ARG: { auto *MFI = CurDAG->getMachineFunction().getInfo(); auto CallId = MFI->getPreallocatedIdForCallSite( cast(Node->getOperand(1))->getValue()); SDValue Chain = Node->getOperand(0); SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); SDValue ArgIndex = Node->getOperand(2); SDValue Ops[3]; Ops[0] = CallIdValue; Ops[1] = ArgIndex; Ops[2] = Chain; MachineSDNode *New = CurDAG->getMachineNode( TargetOpcode::PREALLOCATED_ARG, dl, CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), MVT::Other), Ops); ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain CurDAG->RemoveDeadNode(Node); return; } case X86ISD::AESENCWIDE128KL: case X86ISD::AESDECWIDE128KL: case X86ISD::AESENCWIDE256KL: case X86ISD::AESDECWIDE256KL: { if (!Subtarget->hasWIDEKL()) break; unsigned Opcode; switch (Node->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::AESENCWIDE128KL: Opcode = X86::AESENCWIDE128KL; break; case X86ISD::AESDECWIDE128KL: Opcode = X86::AESDECWIDE128KL; break; case X86ISD::AESENCWIDE256KL: Opcode = X86::AESENCWIDE256KL; break; case X86ISD::AESDECWIDE256KL: Opcode = X86::AESDECWIDE256KL; break; } SDValue Chain = Node->getOperand(0); SDValue Addr = Node->getOperand(1); SDValue Base, Scale, Index, Disp, Segment; if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment)) break; Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2), SDValue()); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9), Chain.getValue(1)); MachineSDNode *Res = CurDAG->getMachineNode( Opcode, dl, Node->getVTList(), {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)}); CurDAG->setNodeMemRefs(Res, cast(Node)->getMemOperand()); ReplaceNode(Node, Res); return; } } SelectCode(Node); } bool X86DAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { SDValue Op0, Op1, Op2, Op3, Op4; switch (ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); case InlineAsm::Constraint_o: // offsetable ?? 
case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } OutOps.push_back(Op0); OutOps.push_back(Op1); OutOps.push_back(Op2); OutOps.push_back(Op3); OutOps.push_back(Op4); return false; } /// This pass converts a legalized DAG into a X86-specific DAG, /// ready for instruction scheduling. FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 209ebd4b3de3..e4f3290cab9f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -1,140 +1,149 @@ //===---- X86InstrAMX.td - AMX Instruction Set Extension --*- tablegen -*--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file describes the instructions that make up the Intel AMX instruction // set. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // AMX instructions let Predicates = [HasAMXTILE, In64BitMode] in { let SchedRW = [WriteSystem] in { let hasSideEffects = 1, Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "ldtilecfg\t$src", [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS; let hasSideEffects = 1 in def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "sttilecfg\t$src", [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD; let mayLoad = 1 in def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloadd\t{$src, $dst|$dst, $src}", []>, VEX, T8XD; let mayLoad = 1 in def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloaddt1\t{$src, $dst|$dst, $src}", []>, VEX, T8PD; let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS; let mayStore = 1 in def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), (ins sibmem:$dst, TILE:$src), "tilestored\t{$src, $dst|$dst, $src}", []>, VEX, T8XS; def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), "tilezero\t$dst", []>, VEX, T8XD; // Pseduo instruction for RA. + let hasSideEffects = 1, mayLoad = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; + + let hasSideEffects = 1, mayStore = 1 in + def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; + def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, - opaquemem:$src3), []>; + opaquemem:$src3, + TILECFG:$cfg), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, - TILE:$src4), []>; + TILE:$src4, TILECFG:$cfg), []>; def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2), []>; + GR16:$src2, + TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
// To be translated to the actual instructions in X86ISelLowering.cpp def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; } } // SchedRW } // HasAMXTILE let Predicates = [HasAMXINT8, In64BitMode] in { let SchedRW = [WriteSystem] in { let Constraints = "$src1 = $dst" in { def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XD; def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8PD; def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8PS; } // Pseduo instruction for RA. let Constraints = "$src4 = $dst" in def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), []>; + TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbssd timm:$src1, timm:$src2, timm:$src3)]>; def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbsud timm:$src1, timm:$src2, timm:$src3)]>; def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbusd timm:$src1, timm:$src2, timm:$src3)]>; def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbuud timm:$src1, timm:$src2, timm:$src3)]>; } } } // HasAMXTILE let Predicates = [HasAMXBF16, In64BitMode] in { let SchedRW = [WriteSystem] in { let Constraints = "$src1 = $dst" in def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>; } } } // HasAMXTILE, HasAMXBF16 diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index fe434bd80f35..d9bab14f0c08 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1,9057 +1,9065 @@ //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of the TargetInstrInfo class. 
// //===----------------------------------------------------------------------===// #include "X86InstrInfo.h" #include "X86.h" #include "X86InstrBuilder.h" #include "X86InstrFoldTables.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; #define DEBUG_TYPE "x86-instr-info" #define GET_INSTRINFO_CTOR_DTOR #include "X86GenInstrInfo.inc" static cl::opt NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions"), cl::Hidden); static cl::opt PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden); static cl::opt ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden); static cl::opt PartialRegUpdateClearance("partial-reg-update-clearance", cl::desc("Clearance between two register writes " "for inserting XOR to avoid partial " "register update"), cl::init(64), cl::Hidden); static cl::opt UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), cl::init(128), cl::Hidden); // Pin the vtable to this file. void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32), X86::CATCHRET, (STI.is64Bit() ? X86::RETQ : X86::RETL)), Subtarget(STI), RI(STI.getTargetTriple()) { } bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const { switch (MI.getOpcode()) { default: break; case X86::MOVSX16rr8: case X86::MOVZX16rr8: case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: if (!Subtarget.is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; LLVM_FALLTHROUGH; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: case X86::MOVSX64rr32: { if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) // Be conservative. 
return false; SrcReg = MI.getOperand(1).getReg(); DstReg = MI.getOperand(0).getReg(); switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::MOVSX16rr8: case X86::MOVZX16rr8: case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: SubIdx = X86::sub_8bit; break; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: SubIdx = X86::sub_16bit; break; case X86::MOVSX64rr32: SubIdx = X86::sub_32bit; break; } return true; } } return false; } bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { switch (MI.getOpcode()) { default: // By default, assume that the instruction is not data invariant. return false; // Some target-independent operations that trivially lower to data-invariant // instructions. case TargetOpcode::COPY: case TargetOpcode::INSERT_SUBREG: case TargetOpcode::SUBREG_TO_REG: return true; // On x86 it is believed that imul is constant time w.r.t. the loaded data. // However, they set flags and are perhaps the most surprisingly constant // time operations so we call them out here separately. case X86::IMUL16rr: case X86::IMUL16rri8: case X86::IMUL16rri: case X86::IMUL32rr: case X86::IMUL32rri8: case X86::IMUL32rri: case X86::IMUL64rr: case X86::IMUL64rri32: case X86::IMUL64rri8: // Bit scanning and counting instructions that are somewhat surprisingly // constant time as they scan across bits and do other fairly complex // operations like popcnt, but are believed to be constant time on x86. // However, these set flags. case X86::BSF16rr: case X86::BSF32rr: case X86::BSF64rr: case X86::BSR16rr: case X86::BSR32rr: case X86::BSR64rr: case X86::LZCNT16rr: case X86::LZCNT32rr: case X86::LZCNT64rr: case X86::POPCNT16rr: case X86::POPCNT32rr: case X86::POPCNT64rr: case X86::TZCNT16rr: case X86::TZCNT32rr: case X86::TZCNT64rr: // Bit manipulation instructions are effectively combinations of basic // arithmetic ops, and should still execute in constant time. These also // set flags. case X86::BLCFILL32rr: case X86::BLCFILL64rr: case X86::BLCI32rr: case X86::BLCI64rr: case X86::BLCIC32rr: case X86::BLCIC64rr: case X86::BLCMSK32rr: case X86::BLCMSK64rr: case X86::BLCS32rr: case X86::BLCS64rr: case X86::BLSFILL32rr: case X86::BLSFILL64rr: case X86::BLSI32rr: case X86::BLSI64rr: case X86::BLSIC32rr: case X86::BLSIC64rr: case X86::BLSMSK32rr: case X86::BLSMSK64rr: case X86::BLSR32rr: case X86::BLSR64rr: case X86::TZMSK32rr: case X86::TZMSK64rr: // Bit extracting and clearing instructions should execute in constant time, // and set flags. case X86::BEXTR32rr: case X86::BEXTR64rr: case X86::BEXTRI32ri: case X86::BEXTRI64ri: case X86::BZHI32rr: case X86::BZHI64rr: // Shift and rotate. 
case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1: case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL: case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri: case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1: case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL: case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri: case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1: case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL: case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri: case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1: case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL: case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1: case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL: case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri: case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL: case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8: case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL: case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8: // Basic arithmetic is constant time on the input but does set flags. case X86::ADC8rr: case X86::ADC8ri: case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8: case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8: case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32: case X86::ADD8rr: case X86::ADD8ri: case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32: case X86::AND8rr: case X86::AND8ri: case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8: case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8: case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32: case X86::OR8rr: case X86::OR8ri: case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8: case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8: case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32: case X86::SBB8rr: case X86::SBB8ri: case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8: case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8: case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32: case X86::SUB8rr: case X86::SUB8ri: case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32: case X86::XOR8rr: case X86::XOR8ri: case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8: case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8: case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32: // Arithmetic with just 32-bit and 64-bit variants and no immediates. case X86::ADCX32rr: case X86::ADCX64rr: case X86::ADOX32rr: case X86::ADOX64rr: case X86::ANDN32rr: case X86::ANDN64rr: // Unary arithmetic operations. case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r: case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r: case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: // Unlike other arithmetic, NOT doesn't set EFLAGS. 
case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r: // Various move instructions used to zero or sign extend things. Note that we // intentionally don't support the _NOREX variants as we can't handle that // register constraint anyways. case X86::MOVSX16rr8: case X86::MOVSX32rr8: case X86::MOVSX32rr16: case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32: case X86::MOVZX16rr8: case X86::MOVZX32rr8: case X86::MOVZX32rr16: case X86::MOVZX64rr8: case X86::MOVZX64rr16: case X86::MOV32rr: // Arithmetic instructions that are both constant time and don't set flags. case X86::RORX32ri: case X86::RORX64ri: case X86::SARX32rr: case X86::SARX64rr: case X86::SHLX32rr: case X86::SHLX64rr: case X86::SHRX32rr: case X86::SHRX64rr: // LEA doesn't actually access memory, and its arithmetic is constant time. case X86::LEA16r: case X86::LEA32r: case X86::LEA64_32r: case X86::LEA64r: return true; } } bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { switch (MI.getOpcode()) { default: // By default, assume that the load will immediately leak. return false; // On x86 it is believed that imul is constant time w.r.t. the loaded data. // However, they set flags and are perhaps the most surprisingly constant // time operations so we call them out here separately. case X86::IMUL16rm: case X86::IMUL16rmi8: case X86::IMUL16rmi: case X86::IMUL32rm: case X86::IMUL32rmi8: case X86::IMUL32rmi: case X86::IMUL64rm: case X86::IMUL64rmi32: case X86::IMUL64rmi8: // Bit scanning and counting instructions that are somewhat surprisingly // constant time as they scan across bits and do other fairly complex // operations like popcnt, but are believed to be constant time on x86. // However, these set flags. case X86::BSF16rm: case X86::BSF32rm: case X86::BSF64rm: case X86::BSR16rm: case X86::BSR32rm: case X86::BSR64rm: case X86::LZCNT16rm: case X86::LZCNT32rm: case X86::LZCNT64rm: case X86::POPCNT16rm: case X86::POPCNT32rm: case X86::POPCNT64rm: case X86::TZCNT16rm: case X86::TZCNT32rm: case X86::TZCNT64rm: // Bit manipulation instructions are effectively combinations of basic // arithmetic ops, and should still execute in constant time. These also // set flags. case X86::BLCFILL32rm: case X86::BLCFILL64rm: case X86::BLCI32rm: case X86::BLCI64rm: case X86::BLCIC32rm: case X86::BLCIC64rm: case X86::BLCMSK32rm: case X86::BLCMSK64rm: case X86::BLCS32rm: case X86::BLCS64rm: case X86::BLSFILL32rm: case X86::BLSFILL64rm: case X86::BLSI32rm: case X86::BLSI64rm: case X86::BLSIC32rm: case X86::BLSIC64rm: case X86::BLSMSK32rm: case X86::BLSMSK64rm: case X86::BLSR32rm: case X86::BLSR64rm: case X86::TZMSK32rm: case X86::TZMSK64rm: // Bit extracting and clearing instructions should execute in constant time, // and set flags. case X86::BEXTR32rm: case X86::BEXTR64rm: case X86::BEXTRI32mi: case X86::BEXTRI64mi: case X86::BZHI32rm: case X86::BZHI64rm: // Basic arithmetic is constant time on the input but does set flags. 
case X86::ADC8rm: case X86::ADC16rm: case X86::ADC32rm: case X86::ADC64rm: case X86::ADCX32rm: case X86::ADCX64rm: case X86::ADD8rm: case X86::ADD16rm: case X86::ADD32rm: case X86::ADD64rm: case X86::ADOX32rm: case X86::ADOX64rm: case X86::AND8rm: case X86::AND16rm: case X86::AND32rm: case X86::AND64rm: case X86::ANDN32rm: case X86::ANDN64rm: case X86::OR8rm: case X86::OR16rm: case X86::OR32rm: case X86::OR64rm: case X86::SBB8rm: case X86::SBB16rm: case X86::SBB32rm: case X86::SBB64rm: case X86::SUB8rm: case X86::SUB16rm: case X86::SUB32rm: case X86::SUB64rm: case X86::XOR8rm: case X86::XOR16rm: case X86::XOR32rm: case X86::XOR64rm: // Integer multiply w/o affecting flags is still believed to be constant // time on x86. Called out separately as this is among the most surprising // instructions to exhibit that behavior. case X86::MULX32rm: case X86::MULX64rm: // Arithmetic instructions that are both constant time and don't set flags. case X86::RORX32mi: case X86::RORX64mi: case X86::SARX32rm: case X86::SARX64rm: case X86::SHLX32rm: case X86::SHLX64rm: case X86::SHRX32rm: case X86::SHRX64rm: // Conversions are believed to be constant time and don't set flags. case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm: case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm: case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm: case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm: case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm: case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm: case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm: case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm: case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm: case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm: // AVX512 added unsigned integer conversions. case X86::VCVTTSD2USI64Zrm: case X86::VCVTTSD2USIZrm: case X86::VCVTTSS2USI64Zrm: case X86::VCVTTSS2USIZrm: case X86::VCVTUSI2SDZrm: case X86::VCVTUSI642SDZrm: case X86::VCVTUSI2SSZrm: case X86::VCVTUSI642SSZrm: // Loads to register don't set flags. case X86::MOV8rm: case X86::MOV8rm_NOREX: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case X86::MOVSX16rm8: case X86::MOVSX32rm16: case X86::MOVSX32rm8: case X86::MOVSX32rm8_NOREX: case X86::MOVSX64rm16: case X86::MOVSX64rm32: case X86::MOVSX64rm8: case X86::MOVZX16rm8: case X86::MOVZX32rm16: case X86::MOVZX32rm8: case X86::MOVZX32rm8_NOREX: case X86::MOVZX64rm16: case X86::MOVZX64rm8: return true; } } int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); if (isFrameInstr(MI)) { int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign()); SPAdj -= getFrameAdjustment(MI); if (!isFrameSetup(MI)) SPAdj = -SPAdj; return SPAdj; } // To know whether a call adjusts the stack, we need information // that is bound to the following ADJCALLSTACKUP pseudo. // Look for the next ADJCALLSTACKUP that follows the call. if (MI.isCall()) { const MachineBasicBlock *MBB = MI.getParent(); auto I = ++MachineBasicBlock::const_iterator(MI); for (auto E = MBB->end(); I != E; ++I) { if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall()) break; } // If we could not find a frame destroy opcode, then it has already // been simplified, so we don't care. 
if (I->getOpcode() != getCallFrameDestroyOpcode()) return 0; return -(I->getOperand(1).getImm()); } // Currently handle only PUSHes we can reasonably expect to see // in call sequences switch (MI.getOpcode()) { default: return 0; case X86::PUSH32i8: case X86::PUSH32r: case X86::PUSH32rmm: case X86::PUSH32rmr: case X86::PUSHi32: return 4; case X86::PUSH64i8: case X86::PUSH64r: case X86::PUSH64rmm: case X86::PUSH64rmr: case X86::PUSH64i32: return 8; } } /// Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, int &FrameIndex) const { if (MI.getOperand(Op + X86::AddrBaseReg).isFI() && MI.getOperand(Op + X86::AddrScaleAmt).isImm() && MI.getOperand(Op + X86::AddrIndexReg).isReg() && MI.getOperand(Op + X86::AddrDisp).isImm() && MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 && MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 && MI.getOperand(Op + X86::AddrDisp).getImm() == 0) { FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex(); return true; } return false; } static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) { switch (Opcode) { default: return false; case X86::MOV8rm: case X86::KMOVBkm: MemBytes = 1; return true; case X86::MOV16rm: case X86::KMOVWkm: MemBytes = 2; return true; case X86::MOV32rm: case X86::MOVSSrm: case X86::MOVSSrm_alt: case X86::VMOVSSrm: case X86::VMOVSSrm_alt: case X86::VMOVSSZrm: case X86::VMOVSSZrm_alt: case X86::KMOVDkm: MemBytes = 4; return true; case X86::MOV64rm: case X86::LD_Fp64m: case X86::MOVSDrm: case X86::MOVSDrm_alt: case X86::VMOVSDrm: case X86::VMOVSDrm_alt: case X86::VMOVSDZrm: case X86::VMOVSDZrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::KMOVQkm: MemBytes = 8; return true; case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: case X86::VMOVDQU16Z128rm: case X86::VMOVDQA32Z128rm: case X86::VMOVDQU32Z128rm: case X86::VMOVDQA64Z128rm: case X86::VMOVDQU64Z128rm: MemBytes = 16; return true; case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: case X86::VMOVDQU16Z256rm: case X86::VMOVDQA32Z256rm: case X86::VMOVDQU32Z256rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQU64Z256rm: MemBytes = 32; return true; case X86::VMOVAPSZrm: case X86::VMOVUPSZrm: case X86::VMOVAPDZrm: case X86::VMOVUPDZrm: case X86::VMOVDQU8Zrm: case X86::VMOVDQU16Zrm: case X86::VMOVDQA32Zrm: case X86::VMOVDQU32Zrm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU64Zrm: MemBytes = 64; return true; } } static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) { switch (Opcode) { default: return false; case X86::MOV8mr: case X86::KMOVBmk: MemBytes = 1; return true; case X86::MOV16mr: case X86::KMOVWmk: MemBytes = 2; return true; case X86::MOV32mr: case X86::MOVSSmr: case X86::VMOVSSmr: case X86::VMOVSSZmr: case X86::KMOVDmk: MemBytes = 4; return true; case X86::MOV64mr: case 
X86::ST_FpP64m: case X86::MOVSDmr: case X86::VMOVSDmr: case X86::VMOVSDZmr: case X86::MMX_MOVD64mr: case X86::MMX_MOVQ64mr: case X86::MMX_MOVNTQmr: case X86::KMOVQmk: MemBytes = 8; return true; case X86::MOVAPSmr: case X86::MOVUPSmr: case X86::MOVAPDmr: case X86::MOVUPDmr: case X86::MOVDQAmr: case X86::MOVDQUmr: case X86::VMOVAPSmr: case X86::VMOVUPSmr: case X86::VMOVAPDmr: case X86::VMOVUPDmr: case X86::VMOVDQAmr: case X86::VMOVDQUmr: case X86::VMOVUPSZ128mr: case X86::VMOVAPSZ128mr: case X86::VMOVUPSZ128mr_NOVLX: case X86::VMOVAPSZ128mr_NOVLX: case X86::VMOVUPDZ128mr: case X86::VMOVAPDZ128mr: case X86::VMOVDQA32Z128mr: case X86::VMOVDQU32Z128mr: case X86::VMOVDQA64Z128mr: case X86::VMOVDQU64Z128mr: case X86::VMOVDQU8Z128mr: case X86::VMOVDQU16Z128mr: MemBytes = 16; return true; case X86::VMOVUPSYmr: case X86::VMOVAPSYmr: case X86::VMOVUPDYmr: case X86::VMOVAPDYmr: case X86::VMOVDQUYmr: case X86::VMOVDQAYmr: case X86::VMOVUPSZ256mr: case X86::VMOVAPSZ256mr: case X86::VMOVUPSZ256mr_NOVLX: case X86::VMOVAPSZ256mr_NOVLX: case X86::VMOVUPDZ256mr: case X86::VMOVAPDZ256mr: case X86::VMOVDQU8Z256mr: case X86::VMOVDQU16Z256mr: case X86::VMOVDQA32Z256mr: case X86::VMOVDQU32Z256mr: case X86::VMOVDQA64Z256mr: case X86::VMOVDQU64Z256mr: MemBytes = 32; return true; case X86::VMOVUPSZmr: case X86::VMOVAPSZmr: case X86::VMOVUPDZmr: case X86::VMOVAPDZmr: case X86::VMOVDQU8Zmr: case X86::VMOVDQU16Zmr: case X86::VMOVDQA32Zmr: case X86::VMOVDQU32Zmr: case X86::VMOVDQA64Zmr: case X86::VMOVDQU64Zmr: MemBytes = 64; return true; } return false; } unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { unsigned Dummy; return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy); } unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex, unsigned &MemBytes) const { if (isFrameLoadOpcode(MI.getOpcode(), MemBytes)) if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) return MI.getOperand(0).getReg(); return 0; } unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { unsigned Dummy; if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) { unsigned Reg; if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) return Reg; // Check for post-frame index elimination operations SmallVector Accesses; if (hasLoadFromStackSlot(MI, Accesses)) { FrameIndex = cast(Accesses.front()->getPseudoValue()) ->getFrameIndex(); return 1; } } return 0; } unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { unsigned Dummy; return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy); } unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex, unsigned &MemBytes) const { if (isFrameStoreOpcode(MI.getOpcode(), MemBytes)) if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 && isFrameOperand(MI, 0, FrameIndex)) return MI.getOperand(X86::AddrNumOperands).getReg(); return 0; } unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { unsigned Dummy; if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) { unsigned Reg; if ((Reg = isStoreToStackSlot(MI, FrameIndex))) return Reg; // Check for post-frame index elimination operations SmallVector Accesses; if (hasStoreToStackSlot(MI, Accesses)) { FrameIndex = cast(Accesses.front()->getPseudoValue()) ->getFrameIndex(); return 1; } } return 0; } /// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. 
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { // Don't waste compile time scanning use-def chains of physregs. if (!BaseReg.isVirtual()) return false; bool isPICBase = false; for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), E = MRI.def_instr_end(); I != E; ++I) { MachineInstr *DefMI = &*I; if (DefMI->getOpcode() != X86::MOVPC32r) return false; assert(!isPICBase && "More than one PIC base?"); isPICBase = true; } return isPICBase; } bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA) const { switch (MI.getOpcode()) { default: // This function should only be called for opcodes with the ReMaterializable // flag set. llvm_unreachable("Unknown rematerializable operation!"); break; case X86::LOAD_STACK_GUARD: case X86::AVX1_SETALLONES: case X86::AVX2_SETALLONES: case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0F128: case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: case X86::FsFLD0F128: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: case X86::MMX_SET0: case X86::MOV32ImmSExti8: case X86::MOV32r0: case X86::MOV32r1: case X86::MOV32r_1: case X86::MOV32ri64: case X86::MOV64ImmSExti8: case X86::V_SET0: case X86::V_SETALLONES: case X86::MOV16ri: case X86::MOV32ri: case X86::MOV64ri: case X86::MOV64ri32: case X86::MOV8ri: return true; case X86::MOV8rm: case X86::MOV8rm_NOREX: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case X86::MOVSSrm: case X86::MOVSSrm_alt: case X86::MOVSDrm: case X86::MOVSDrm_alt: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: case X86::VMOVSSrm: case X86::VMOVSSrm_alt: case X86::VMOVSDrm: case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: // AVX-512 case X86::VMOVSSZrm: case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: case X86::VMOVSDZrm_alt: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVAPSZ256rm: case X86::VMOVAPSZ128rm_NOVLX: case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVAPSZrm: case X86::VMOVDQA32Z128rm: case X86::VMOVDQA32Z256rm: case X86::VMOVDQA32Zrm: case X86::VMOVDQA64Z128rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU16Z128rm: case X86::VMOVDQU16Z256rm: case X86::VMOVDQU16Zrm: case X86::VMOVDQU32Z128rm: case X86::VMOVDQU32Z256rm: case X86::VMOVDQU32Zrm: case X86::VMOVDQU64Z128rm: case X86::VMOVDQU64Z256rm: case X86::VMOVDQU64Zrm: case X86::VMOVDQU8Z128rm: case X86::VMOVDQU8Z256rm: case X86::VMOVDQU8Zrm: case X86::VMOVUPDZ128rm: case X86::VMOVUPDZ256rm: case X86::VMOVUPDZrm: case X86::VMOVUPSZ128rm: case X86::VMOVUPSZ256rm: case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. 
if (MI.getOperand(1 + X86::AddrBaseReg).isReg() && MI.getOperand(1 + X86::AddrScaleAmt).isImm() && MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && MI.isDereferenceableInvariantLoad(AA)) { Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; // Allow re-materialization of PIC load. if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal()) return false; const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return regIsPICBase(BaseReg, MRI); } return false; } case X86::LEA32r: case X86::LEA64r: { if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() && MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && !MI.getOperand(1 + X86::AddrDisp).isReg()) { // lea fi#, lea GV, etc. are all rematerializable. if (!MI.getOperand(1 + X86::AddrBaseReg).isReg()) return true; Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0) return true; // Allow re-materialization of lea PICBase + x. const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return regIsPICBase(BaseReg, MRI); } return false; } } } void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != MachineBasicBlock::LQR_Dead) { // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side // effects. int Value; switch (Orig.getOpcode()) { case X86::MOV32r0: Value = 0; break; case X86::MOV32r1: Value = 1; break; case X86::MOV32r_1: Value = -1; break; default: llvm_unreachable("Unexpected instruction!"); } const DebugLoc &DL = Orig.getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)) .add(Orig.getOperand(0)) .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MBB.insert(I, MI); } MachineInstr &NewMI = *std::prev(I); NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI); } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS && !MO.isDead()) { return true; } } return false; } /// Check whether the shift count for a machine operand is non-zero. inline static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx) { // The shift count is six bits with the REX.W prefix and five bits without. unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31; unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm(); return Imm & ShiftCountMask; } /// Check whether the given shift count is appropriate /// can be represented by a LEA instruction. inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. // A LEA instruction utilizes a SIB byte to encode its scale factor. // The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. 
return ShAmt < 4 && ShAmt > 0; } bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; } else { RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; } Register SrcReg = Src.getReg(); // For both LEA64 and LEA32 the register already has essentially the right // type (32-bit or 64-bit) we may just need to forbid SP. if (Opc != X86::LEA64_32r) { NewSrc = SrcReg; isKill = Src.isKill(); assert(!Src.isUndef() && "Undef op doesn't need optimization"); if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) return false; return true; } // This is for an LEA64_32r and incoming registers are 32-bit. One way or // another we need to add 64-bit registers to the final MI. if (SrcReg.isPhysical()) { ImplicitOp = Src; ImplicitOp.setImplicit(); NewSrc = getX86SubSuperRegister(Src.getReg(), 64); isKill = Src.isKill(); assert(!Src.isUndef() && "Undef op doesn't need optimization"); } else { // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .add(Src); // Which is obviously going to be dead after we're done with it. isKill = true; if (LV) LV->replaceKillInstruction(SrcReg, MI, *Copy); } // We've set all the parameters without issue. return true; } MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV, bool Is8BitOp) const { // We handle 8-bit adds and various 16-bit opcodes in the switch below. MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && "Unexpected type for LEA transform"); // TODO: For a 32-bit target, we need to adjust the LEA variables with // something like this: // Opcode = X86::LEA32r; // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // OutRegLEA = // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass) // : RegInfo.createVirtualRegister(&X86::GR32RegClass); if (!Subtarget.is64Bit()) return nullptr; unsigned Opcode = X86::LEA64_32r; Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); // Build and insert into an implicit UNDEF value. This is OK because // we will be shifting and then extracting the lower 8/16-bits. // This has the potential to cause partial register stall. e.g. // movw (%rbp,%rcx,2), %dx // leal -65(%rdx), %esi // But testing has shown this *does* help performance in 64-bit mode (at // least on modern x86 machines). MachineBasicBlock::iterator MBBI = MI.getIterator(); Register Dest = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); unsigned SubReg = Is8BitOp ? 
X86::sub_8bit : X86::sub_16bit; assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); MachineInstr *InsMI = BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(InRegLEA, RegState::Define, SubReg) .addReg(Src, getKillRegState(IsKill)); MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA); switch (MIOpc) { default: llvm_unreachable("Unreachable!"); case X86::SHL8ri: case X86::SHL16ri: { unsigned ShAmt = MI.getOperand(2).getImm(); MIB.addReg(0).addImm(1ULL << ShAmt) .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); break; } case X86::INC8r: case X86::INC16r: addRegOffset(MIB, InRegLEA, true, 1); break; case X86::DEC8r: case X86::DEC16r: addRegOffset(MIB, InRegLEA, true, -1); break; case X86::ADD8ri: case X86::ADD8ri_DB: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); break; case X86::ADD8rr: case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { Register Src2 = MI.getOperand(2).getReg(); bool IsKill2 = MI.getOperand(2).isKill(); assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); unsigned InRegLEA2 = 0; MachineInstr *InsMI2 = nullptr; if (Src == Src2) { // ADD8rr/ADD16rr killed %reg1028, %reg1028 // just a single insert_subreg. addRegReg(MIB, InRegLEA, true, InRegLEA, false); } else { if (Subtarget.is64Bit()) InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); else InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // we will be shifting and then extracting the lower 8/16-bits. BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2); InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(InRegLEA2, RegState::Define, SubReg) .addReg(Src2, getKillRegState(IsKill2)); addRegReg(MIB, InRegLEA, true, InRegLEA2, true); } if (LV && IsKill2 && InsMI2) LV->replaceKillInstruction(Src2, MI, *InsMI2); break; } } MachineInstr *NewMI = MIB; MachineInstr *ExtMI = BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(Dest, RegState::Define | getDeadRegState(IsDead)) .addReg(OutRegLEA, RegState::Kill, SubReg); if (LV) { // Update live variables. LV->getVarInfo(InRegLEA).Kills.push_back(NewMI); LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI); if (IsKill) LV->replaceKillInstruction(Src, MI, *InsMI); if (IsDead) LV->replaceKillInstruction(Dest, MI, *ExtMI); } return ExtMI; } /// This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target /// may be able to convert a two-address instruction into a true /// three-address instruction on demand. This allows the X86 target (for /// example) to convert ADD and SHL instructions into LEA instructions if they /// would require register copies due to two-addressness. /// /// This method returns a null pointer if the transformation cannot be /// performed, otherwise it returns the new instruction. /// MachineInstr * X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const { // The following opcodes also sets the condition code register(s). Only // convert them to equivalent lea if the condition code register def's // are dead! 
if (hasLiveCondCodeDef(MI)) return nullptr; MachineFunction &MF = *MI.getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. const MachineOperand &Dest = MI.getOperand(0); const MachineOperand &Src = MI.getOperand(1); // Ideally, operations with undef should be folded before we get here, but we // can't guarantee it. Bail out because optimizing undefs is a waste of time. // Without this, we have to forward undef state to new register operands to // avoid machine verifier errors. if (Src.isUndef()) return nullptr; if (MI.getNumOperands() > 2) if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef()) return nullptr; MachineInstr *NewMI = nullptr; bool Is64Bit = Subtarget.is64Bit(); bool Is8BitOp = false; unsigned MIOpc = MI.getOpcode(); switch (MIOpc) { default: llvm_unreachable("Unreachable!"); case X86::SHL64ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass( Src.getReg(), &X86::GR64_NOSPRegClass)) return nullptr; NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) .add(Dest) .addReg(0) .addImm(1ULL << ShAmt) .add(Src) .addImm(0) .addReg(0); break; } case X86::SHL32ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; // LEA can't handle ESP. bool isKill; Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(0) .addImm(1ULL << ShAmt) .addReg(SrcReg, getKillRegState(isKill)) .addImm(0) .addReg(0); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = MIB; break; } case X86::SHL8ri: Is8BitOp = true; LLVM_FALLTHROUGH; case X86::SHL16ri: { assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); } case X86::INC64r: case X86::INC32r: { assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = addOffset(MIB, 1); break; } case X86::DEC64r: case X86::DEC32r: { assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (Is64Bit ? 
X86::LEA64_32r : X86::LEA32r); bool isKill; Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = addOffset(MIB, -1); break; } case X86::DEC8r: case X86::INC8r: Is8BitOp = true; LLVM_FALLTHROUGH; case X86::DEC16r: case X86::INC16r: return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD32rr: case X86::ADD32rr_DB: { assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc; if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) Opc = X86::LEA64r; else Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) return nullptr; const MachineOperand &Src2 = MI.getOperand(2); bool isKill2; Register SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, ImplicitOp2, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); if (ImplicitOp2.getReg() != 0) MIB.add(ImplicitOp2); NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); if (LV && Src2.isKill()) LV->replaceKillInstruction(SrcReg2, MI, *NewMI); break; } case X86::ADD8rr: case X86::ADD8rr_DB: Is8BitOp = true; LLVM_FALLTHROUGH; case X86::ADD16rr: case X86::ADD16rr_DB: return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: case X86::ADD64ri8_DB: assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); NewMI = addOffset( BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src), MI.getOperand(2)); break; case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD32ri_DB: case X86::ADD32ri8_DB: { assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = addOffset(MIB, MI.getOperand(2)); break; } case X86::ADD8ri: case X86::ADD8ri_DB: Is8BitOp = true; LLVM_FALLTHROUGH; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::SUB8ri: case X86::SUB16ri8: case X86::SUB16ri: /// FIXME: Support these similar to ADD8ri/ADD16ri*. return nullptr; case X86::SUB32ri8: case X86::SUB32ri: { if (!MI.getOperand(2).isImm()) return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = Is64Bit ? 
X86::LEA64_32r : X86::LEA32r; bool isKill; Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .addReg(SrcReg, getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = addOffset(MIB, -Imm); break; } case X86::SUB64ri8: case X86::SUB64ri32: { if (!MI.getOperand(2).isImm()) return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src); NewMI = addOffset(MIB, -Imm); break; } case X86::VMOVDQU8Z128rmk: case X86::VMOVDQU8Z256rmk: case X86::VMOVDQU8Zrmk: case X86::VMOVDQU16Z128rmk: case X86::VMOVDQU16Z256rmk: case X86::VMOVDQU16Zrmk: case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk: case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk: case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk: case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk: case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk: case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk: case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk: case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk: case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk: case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: case X86::VBROADCASTSDZ256rmk: case X86::VBROADCASTSDZrmk: case X86::VBROADCASTSSZ128rmk: case X86::VBROADCASTSSZ256rmk: case X86::VBROADCASTSSZrmk: case X86::VPBROADCASTDZ128rmk: case X86::VPBROADCASTDZ256rmk: case X86::VPBROADCASTDZrmk: case X86::VPBROADCASTQZ128rmk: case X86::VPBROADCASTQZ256rmk: case X86::VPBROADCASTQZrmk: { unsigned Opc; switch (MIOpc) { default: llvm_unreachable("Unreachable!"); case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; case X86::VMOVAPDZ256rmk: Opc 
= X86::VBLENDMPDZ256rmk; break; case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break; case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break; case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break; case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break; case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break; case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break; case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break; case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break; case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break; case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break; case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break; } NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .add(MI.getOperand(2)) .add(Src) .add(MI.getOperand(3)) .add(MI.getOperand(4)) .add(MI.getOperand(5)) .add(MI.getOperand(6)) .add(MI.getOperand(7)); break; } case X86::VMOVDQU8Z128rrk: case X86::VMOVDQU8Z256rrk: case X86::VMOVDQU8Zrrk: case X86::VMOVDQU16Z128rrk: case X86::VMOVDQU16Z256rrk: case X86::VMOVDQU16Zrrk: case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk: case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk: case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk: case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk: case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk: case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk: case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk: case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk: case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk: case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk: case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk: case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: { unsigned Opc; switch (MIOpc) { default: llvm_unreachable("Unreachable!"); case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break; case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break; case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break; case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break; case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break; case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break; case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break; case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break; case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break; case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break; case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break; case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break; case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break; case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break; case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break; case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break; case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break; case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break; case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break; case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break; case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break; case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break; case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break; case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break; case 
X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break; case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break; case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break; case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break; case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break; case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break; } NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) .add(Dest) .add(MI.getOperand(2)) .add(Src) .add(MI.getOperand(3)); break; } } if (!NewMI) return nullptr; if (LV) { // Update live variables if (Src.isKill()) LV->replaceKillInstruction(Src.getReg(), MI, *NewMI); if (Dest.isDead()) LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI); } MFI->insert(MI.getIterator(), NewMI); // Insert the new inst return NewMI; } /// This determines which of three possible cases of a three source commute /// the source indexes correspond to taking into account any mask operands. /// All prevents commuting a passthru operand. Returns -1 if the commute isn't /// possible. /// Case 0 - Possible to commute the first and second operands. /// Case 1 - Possible to commute the first and third operands. /// Case 2 - Possible to commute the second and third operands. static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2) { // Put the lowest index to SrcOpIdx1 to simplify the checks below. if (SrcOpIdx1 > SrcOpIdx2) std::swap(SrcOpIdx1, SrcOpIdx2); unsigned Op1 = 1, Op2 = 2, Op3 = 3; if (X86II::isKMasked(TSFlags)) { Op2++; Op3++; } if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2) return 0; if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3) return 1; if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3) return 2; llvm_unreachable("Unknown three src commute case."); } unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const { unsigned Opc = MI.getOpcode(); // TODO: Commuting the 1st operand of FMA*_Int requires some additional // analysis. The commute optimization is legal only if all users of FMA*_Int // use only the lowest element of the FMA*_Int instruction. Such analysis are // not implemented yet. So, just return 0 in that case. // When such analysis are available this place will be the right place for // calling it. assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) && "Intrinsic instructions can't commute operand 1"); // Determine which case this commute is or if it can't be done. unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2); assert(Case < 3 && "Unexpected case number!"); // Define the FMA forms mapping array that helps to map input FMA form // to output FMA form to preserve the operation semantics after // commuting the operands. 
const unsigned Form132Index = 0; const unsigned Form213Index = 1; const unsigned Form231Index = 2; static const unsigned FormMapping[][3] = { // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; // FMA132 A, C, b; ==> FMA231 C, A, b; // FMA213 B, A, c; ==> FMA213 A, B, c; // FMA231 C, A, b; ==> FMA132 A, C, b; { Form231Index, Form213Index, Form132Index }, // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; // FMA132 A, c, B; ==> FMA132 B, c, A; // FMA213 B, a, C; ==> FMA231 C, a, B; // FMA231 C, a, B; ==> FMA213 B, a, C; { Form132Index, Form231Index, Form213Index }, // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; // FMA132 a, C, B; ==> FMA213 a, B, C; // FMA213 b, A, C; ==> FMA132 b, C, A; // FMA231 c, A, B; ==> FMA231 c, B, A; { Form213Index, Form132Index, Form231Index } }; unsigned FMAForms[3]; FMAForms[0] = FMA3Group.get132Opcode(); FMAForms[1] = FMA3Group.get213Opcode(); FMAForms[2] = FMA3Group.get231Opcode(); unsigned FormIndex; for (FormIndex = 0; FormIndex < 3; FormIndex++) if (Opc == FMAForms[FormIndex]) break; // Everything is ready, just adjust the FMA opcode and return it. FormIndex = FormMapping[Case][FormIndex]; return FMAForms[FormIndex]; } static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) { // Determine which case this commute is or if it can't be done. unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2); assert(Case < 3 && "Unexpected case value!"); // For each case we need to swap two pairs of bits in the final immediate. static const uint8_t SwapMasks[3][4] = { { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5. { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6. { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6. }; uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm(); // Clear out the bits we are swapping. uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] | SwapMasks[Case][2] | SwapMasks[Case][3]); // If the immediate had a bit of the pair set, then set the opposite bit. if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1]; if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0]; if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3]; if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2]; MI.getOperand(MI.getNumOperands()-1).setImm(NewImm); } // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be // commuted. 
static bool isCommutableVPERMV3Instruction(unsigned Opcode) { #define VPERM_CASES(Suffix) \ case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \ case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \ case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \ case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \ case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \ case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \ case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \ case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \ case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \ case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \ case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \ case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz: #define VPERM_CASES_BROADCAST(Suffix) \ VPERM_CASES(Suffix) \ case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \ case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \ case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \ case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \ case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \ case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz: switch (Opcode) { default: return false; VPERM_CASES(B) VPERM_CASES_BROADCAST(D) VPERM_CASES_BROADCAST(PD) VPERM_CASES_BROADCAST(PS) VPERM_CASES_BROADCAST(Q) VPERM_CASES(W) return true; } #undef VPERM_CASES_BROADCAST #undef VPERM_CASES } // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching // from the I opcode to the T opcode and vice versa. 
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) { #define VPERM_CASES(Orig, New) \ case X86::Orig##128rr: return X86::New##128rr; \ case X86::Orig##128rrkz: return X86::New##128rrkz; \ case X86::Orig##128rm: return X86::New##128rm; \ case X86::Orig##128rmkz: return X86::New##128rmkz; \ case X86::Orig##256rr: return X86::New##256rr; \ case X86::Orig##256rrkz: return X86::New##256rrkz; \ case X86::Orig##256rm: return X86::New##256rm; \ case X86::Orig##256rmkz: return X86::New##256rmkz; \ case X86::Orig##rr: return X86::New##rr; \ case X86::Orig##rrkz: return X86::New##rrkz; \ case X86::Orig##rm: return X86::New##rm; \ case X86::Orig##rmkz: return X86::New##rmkz; #define VPERM_CASES_BROADCAST(Orig, New) \ VPERM_CASES(Orig, New) \ case X86::Orig##128rmb: return X86::New##128rmb; \ case X86::Orig##128rmbkz: return X86::New##128rmbkz; \ case X86::Orig##256rmb: return X86::New##256rmb; \ case X86::Orig##256rmbkz: return X86::New##256rmbkz; \ case X86::Orig##rmb: return X86::New##rmb; \ case X86::Orig##rmbkz: return X86::New##rmbkz; switch (Opcode) { VPERM_CASES(VPERMI2B, VPERMT2B) VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D) VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD) VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS) VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q) VPERM_CASES(VPERMI2W, VPERMT2W) VPERM_CASES(VPERMT2B, VPERMI2B) VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D) VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD) VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS) VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q) VPERM_CASES(VPERMT2W, VPERMI2W) } llvm_unreachable("Unreachable!"); #undef VPERM_CASES_BROADCAST #undef VPERM_CASES } MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & { if (NewMI) return *MI.getParent()->getParent()->CloneMachineInstr(&MI); return MI; }; switch (MI.getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) unsigned Opc; unsigned Size; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; } unsigned Amt = MI.getOperand(3).getImm(); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); WorkingMI.getOperand(3).setImm(Size - Amt); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::PFSUBrr: case X86::PFSUBRrr: { // PFSUB x, y: x = x - y // PFSUBR x, y: x = y - x unsigned Opc = (X86::PFSUBRrr == MI.getOpcode() ? 
X86::PFSUBrr : X86::PFSUBRrr); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: case X86::VBLENDPDrri: case X86::VBLENDPSrri: // If we're optimizing for size, try to use MOVSD/MOVSS. if (MI.getParent()->getParent()->getFunction().hasOptSize()) { unsigned Mask, Opc; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break; case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break; case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break; case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break; } if ((MI.getOperand(3).getImm() ^ Mask) == 1) { auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); WorkingMI.RemoveOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } } LLVM_FALLTHROUGH; case X86::PBLENDWrri: case X86::VBLENDPDYrri: case X86::VBLENDPSYrri: case X86::VPBLENDDrri: case X86::VPBLENDWrri: case X86::VPBLENDDYrri: case X86::VPBLENDWYrri:{ int8_t Mask; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::BLENDPDrri: Mask = (int8_t)0x03; break; case X86::BLENDPSrri: Mask = (int8_t)0x0F; break; case X86::PBLENDWrri: Mask = (int8_t)0xFF; break; case X86::VBLENDPDrri: Mask = (int8_t)0x03; break; case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break; case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break; case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break; case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break; case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break; case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break; case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break; } // Only the least significant bits of Imm are used. // Using int8_t to ensure it will be sign extended to the int64_t that // setImm takes in order to match isel behavior. int8_t Imm = MI.getOperand(3).getImm() & Mask; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Mask ^ Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::INSERTPSrr: case X86::VINSERTPSrr: case X86::VINSERTPSZrr: { unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); unsigned ZMask = Imm & 15; unsigned DstIdx = (Imm >> 4) & 3; unsigned SrcIdx = (Imm >> 6) & 3; // We can commute insertps if we zero 2 of the elements, the insertion is // "inline" and we don't override the insertion with a zero. if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 && countPopulation(ZMask) == 2) { unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15); assert(AltIdx < 4 && "Illegal insertion index"); unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } return nullptr; } case X86::MOVSDrr: case X86::MOVSSrr: case X86::VMOVSDrr: case X86::VMOVSSrr:{ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. 
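// MOVSS/MOVSD produce { Src2[0], Src1[rest] }.  After the generic commute code
// swaps the two sources, the same value is produced by a blend that takes only
// the upper lanes from its second operand, hence the 0x0E / 0x02 masks below.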
if (Subtarget.hasSSE41()) { unsigned Mask, Opc; switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; } auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } // Convert to SHUFPD. assert(MI.getOpcode() == X86::MOVSDrr && "Can only commute MOVSDrr without SSE4.1"); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::SHUFPDrri)); WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::SHUFPDrri: { // Commute to MOVSD. assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(X86::MOVSDrr)); WorkingMI.RemoveOperand(3); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr: case X86::VPCLMULQDQYrr: case X86::VPCLMULQDQZrr: case X86::VPCLMULQDQZ128rr: case X86::VPCLMULQDQZ256rr: { // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] unsigned Imm = MI.getOperand(3).getImm(); unsigned Src1Hi = Imm & 0x01; unsigned Src2Hi = Imm & 0x10; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri: case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri: case X86::VPCMPBZrri: case X86::VPCMPUBZrri: case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri: case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri: case X86::VPCMPDZrri: case X86::VPCMPUDZrri: case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri: case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri: case X86::VPCMPQZrri: case X86::VPCMPUQZrri: case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri: case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri: case X86::VPCMPWZrri: case X86::VPCMPUWZrri: case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik: case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik: case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik: case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik: case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik: case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik: case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik: case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik: case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik: case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik: case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik: case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: { // Flip comparison mode immediate (if necessary). unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7; Imm = X86::getSwappedVPCMPImm(Imm); auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPCOMBri: case X86::VPCOMUBri: case X86::VPCOMDri: case X86::VPCOMUDri: case X86::VPCOMQri: case X86::VPCOMUQri: case X86::VPCOMWri: case X86::VPCOMUWri: { // Flip comparison mode immediate (if necessary). 
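// Only the low three bits of the VPCOM immediate encode the predicate; the
// helper below returns the predicate that holds once the sources are swapped.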
unsigned Imm = MI.getOperand(3).getImm() & 0x7; Imm = X86::getSwappedVPCOMImm(Imm); auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VCMPSDZrr: case X86::VCMPSSZrr: case X86::VCMPPDZrri: case X86::VCMPPSZrri: case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: case X86::VCMPPSZ256rri: case X86::VCMPPDZrrik: case X86::VCMPPSZrrik: case X86::VCMPPDZ128rrik: case X86::VCMPPSZ128rrik: case X86::VCMPPDZ256rrik: case X86::VCMPPSZ256rrik: { unsigned Imm = MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f; Imm = X86::getSwappedVCMPImm(Imm); auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPERM2F128rr: case X86::VPERM2I128rr: { // Flip permute source immediate. // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. int8_t Imm = MI.getOperand(3).getImm() & 0xFF; auto &WorkingMI = cloneIfNew(MI); WorkingMI.getOperand(3).setImm(Imm ^ 0x22); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::MOVHLPSrr: case X86::UNPCKHPDrr: case X86::VMOVHLPSrr: case X86::VUNPCKHPDrr: case X86::VMOVHLPSZrr: case X86::VUNPCKHPDZ128rr: { assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!"); unsigned Opc = MI.getOpcode(); switch (Opc) { default: llvm_unreachable("Unreachable!"); case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break; case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break; case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break; case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break; case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break; case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break; } auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: { auto &WorkingMI = cloneIfNew(MI); unsigned OpNo = MI.getDesc().getNumOperands() - 1; X86::CondCode CC = static_cast(MI.getOperand(OpNo).getImm()); WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: case X86::VPTERNLOGDZ128rmbi: case X86::VPTERNLOGDZ256rmbi: case X86::VPTERNLOGDZrmbi: case X86::VPTERNLOGQZ128rmbi: case X86::VPTERNLOGQZ256rmbi: case X86::VPTERNLOGQZrmbi: case 
X86::VPTERNLOGDZ128rmbikz: case X86::VPTERNLOGDZ256rmbikz: case X86::VPTERNLOGDZrmbikz: case X86::VPTERNLOGQZ128rmbikz: case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: { auto &WorkingMI = cloneIfNew(MI); commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } default: { if (isCommutableVPERMV3Instruction(MI.getOpcode())) { unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags); if (FMA3Group) { unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } } } bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, bool IsIntrinsic) const { uint64_t TSFlags = MI.getDesc().TSFlags; unsigned FirstCommutableVecOp = 1; unsigned LastCommutableVecOp = 3; unsigned KMaskOp = -1U; if (X86II::isKMasked(TSFlags)) { // For k-zero-masked operations it is Ok to commute the first vector // operand. Unless this is an intrinsic instruction. // For regular k-masked operations a conservative choice is done as the // elements of the first vector operand, for which the corresponding bit // in the k-mask operand is set to 0, are copied to the result of the // instruction. // TODO/FIXME: The commute still may be legal if it is known that the // k-mask operand is set to either all ones or all zeroes. // It is also Ok to commute the 1st operand if all users of MI use only // the elements enabled by the k-mask operand. For example, // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i] // : v1[i]; // VMOVAPSZmrk , k, v4; // this is the ONLY user of v4 -> // // Ok, to commute v1 in FMADD213PSZrk. // The k-mask operand has index = 2 for masked and zero-masked operations. KMaskOp = 2; // The operand with index = 1 is used as a source for those elements for // which the corresponding bit in the k-mask is set to 0. if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic) FirstCommutableVecOp = 3; LastCommutableVecOp++; } else if (IsIntrinsic) { // Commuting the first operand of an intrinsic instruction isn't possible // unless we can prove that only the lowest element of the result is used. FirstCommutableVecOp = 2; } if (isMem(MI, LastCommutableVecOp)) LastCommutableVecOp--; // Only the first RegOpsNum operands are commutable. // Also, the value 'CommuteAnyOperandIndex' is valid here as it means // that the operand is not specified/fixed. if (SrcOpIdx1 != CommuteAnyOperandIndex && (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp || SrcOpIdx1 == KMaskOp)) return false; if (SrcOpIdx2 != CommuteAnyOperandIndex && (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp || SrcOpIdx2 == KMaskOp)) return false; // Look for two different register operands assumed to be commutable // regardless of the FMA opcode. The FMA opcode is adjusted later. 
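// Strategy: fix one index to the caller's choice (or to the last commutable
// vector operand when unspecified), then scan backwards for another operand
// held in a different register, skipping over the k-mask operand.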
if (SrcOpIdx1 == CommuteAnyOperandIndex || SrcOpIdx2 == CommuteAnyOperandIndex) { unsigned CommutableOpIdx2 = SrcOpIdx2; // At least one of operands to be commuted is not specified and // this method is free to choose appropriate commutable operands. if (SrcOpIdx1 == SrcOpIdx2) // Both of operands are not fixed. By default set one of commutable // operands to the last register operand of the instruction. CommutableOpIdx2 = LastCommutableVecOp; else if (SrcOpIdx2 == CommuteAnyOperandIndex) // Only one of operands is not fixed. CommutableOpIdx2 = SrcOpIdx1; // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); unsigned CommutableOpIdx1; for (CommutableOpIdx1 = LastCommutableVecOp; CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { // Just ignore and skip the k-mask operand. if (CommutableOpIdx1 == KMaskOp) continue; // The commuted operands must have different registers. // Otherwise, the commute transformation does not change anything and // is useless then. if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg()) break; } // No appropriate commutable operands were found. if (CommutableOpIdx1 < FirstCommutableVecOp) return false; // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 // to return those values. if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, CommutableOpIdx2)) return false; } return true; } bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { const MCInstrDesc &Desc = MI.getDesc(); if (!Desc.isCommutable()) return false; switch (MI.getOpcode()) { case X86::CMPSDrr: case X86::CMPSSrr: case X86::CMPPDrri: case X86::CMPPSrri: case X86::VCMPSDrr: case X86::VCMPSSrr: case X86::VCMPPDrri: case X86::VCMPPSrri: case X86::VCMPPDYrri: case X86::VCMPPSYrri: case X86::VCMPSDZrr: case X86::VCMPSSZrr: case X86::VCMPPDZrri: case X86::VCMPPSZrri: case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: case X86::VCMPPSZ256rri: case X86::VCMPPDZrrik: case X86::VCMPPSZrrik: case X86::VCMPPDZ128rrik: case X86::VCMPPSZ128rrik: case X86::VCMPPDZ256rrik: case X86::VCMPPSZ256rrik: { unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0; // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; switch (Imm) { default: // EVEX versions can be commuted. if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX) break; return false; case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED break; } // The indices of the commutable operands are 1 and 2 (or 2 and 3 // when masked). // Assign them to the returned operand indices here. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, 2 + OpOffset); } case X86::MOVSSrr: // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since // AVX implies sse4.1. if (Subtarget.hasSSE41()) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return false; case X86::SHUFPDrri: // We can commute this to MOVSD. 
if (MI.getOperand(3).getImm() == 0x02) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return false; case X86::MOVHLPSrr: case X86::UNPCKHPDrr: case X86::VMOVHLPSrr: case X86::VUNPCKHPDrr: case X86::VMOVHLPSZrr: case X86::VUNPCKHPDZ128rr: if (Subtarget.hasSSE2()) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return false; case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: case X86::VPTERNLOGDZ128rmbi: case X86::VPTERNLOGDZ256rmbi: case X86::VPTERNLOGDZrmbi: case X86::VPTERNLOGQZ128rmbi: case X86::VPTERNLOGQZ256rmbi: case X86::VPTERNLOGQZrmbi: case X86::VPTERNLOGDZ128rmbikz: case X86::VPTERNLOGDZ256rmbikz: case X86::VPTERNLOGDZrmbikz: case X86::VPTERNLOGQZ128rmbikz: case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); case X86::VPDPWSSDYrr: case X86::VPDPWSSDrr: case X86::VPDPWSSDSYrr: case X86::VPDPWSSDSrr: case X86::VPDPWSSDZ128r: case X86::VPDPWSSDZ128rk: case X86::VPDPWSSDZ128rkz: case X86::VPDPWSSDZ256r: case X86::VPDPWSSDZ256rk: case X86::VPDPWSSDZ256rkz: case X86::VPDPWSSDZr: case X86::VPDPWSSDZrk: case X86::VPDPWSSDZrkz: case X86::VPDPWSSDSZ128r: case X86::VPDPWSSDSZ128rk: case X86::VPDPWSSDSZ128rkz: case X86::VPDPWSSDSZ256r: case X86::VPDPWSSDSZ256rk: case X86::VPDPWSSDSZ256rkz: case X86::VPDPWSSDSZr: case X86::VPDPWSSDSZrk: case X86::VPDPWSSDSZrkz: case X86::VPMADD52HUQZ128r: case X86::VPMADD52HUQZ128rk: case X86::VPMADD52HUQZ128rkz: case X86::VPMADD52HUQZ256r: case X86::VPMADD52HUQZ256rk: case X86::VPMADD52HUQZ256rkz: case X86::VPMADD52HUQZr: case X86::VPMADD52HUQZrk: case X86::VPMADD52HUQZrkz: case X86::VPMADD52LUQZ128r: case X86::VPMADD52LUQZ128rk: case X86::VPMADD52LUQZ128rkz: case X86::VPMADD52LUQZ256r: case X86::VPMADD52LUQZ256rk: case X86::VPMADD52LUQZ256rkz: case X86::VPMADD52LUQZr: case X86::VPMADD52LUQZrk: case X86::VPMADD52LUQZrkz: { unsigned CommutableOpIdx1 = 2; unsigned CommutableOpIdx2 = 3; if (X86II::isKMasked(Desc.TSFlags)) { // Skip the mask register. ++CommutableOpIdx1; ++CommutableOpIdx2; } if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, CommutableOpIdx2)) return false; if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg()) // No idea. return false; return true; } default: const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags); if (FMA3Group) return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group->isIntrinsic()); // Handled masked instructions since we need to skip over the mask input // and the preserved input. if (X86II::isKMasked(Desc.TSFlags)) { // First assume that the first input is the mask operand and skip past it. 
unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1; unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2; // Check if the first input is tied. If there isn't one then we only // need to skip the mask operand which we did above. if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(), MCOI::TIED_TO) != -1)) { // If this is zero masking instruction with a tied operand, we need to // move the first index back to the first input since this must // be a 3 input instruction and we want the first two non-mask inputs. // Otherwise this is a 2 input instruction with a preserved input and // mask, so we need to move the indices to skip one more input. if (X86II::isKMergeMasked(Desc.TSFlags)) { ++CommutableOpIdx1; ++CommutableOpIdx2; } else { --CommutableOpIdx1; } } if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1, CommutableOpIdx2)) return false; if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg()) // No idea. return false; return true; } return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } return false; } X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; case X86::JCC_1: return static_cast( MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } /// Return condition code of a SETCC opcode. X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; case X86::SETCCr: case X86::SETCCm: return static_cast( MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } /// Return condition code of a CMov opcode. X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm: return static_cast( MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm()); } } /// Return the inverse of the specified condition, /// e.g. turning COND_E to COND_NE. X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { switch (CC) { default: llvm_unreachable("Illegal condition code!"); case X86::COND_E: return X86::COND_NE; case X86::COND_NE: return X86::COND_E; case X86::COND_L: return X86::COND_GE; case X86::COND_LE: return X86::COND_G; case X86::COND_G: return X86::COND_LE; case X86::COND_GE: return X86::COND_L; case X86::COND_B: return X86::COND_AE; case X86::COND_BE: return X86::COND_A; case X86::COND_A: return X86::COND_BE; case X86::COND_AE: return X86::COND_B; case X86::COND_S: return X86::COND_NS; case X86::COND_NS: return X86::COND_S; case X86::COND_P: return X86::COND_NP; case X86::COND_NP: return X86::COND_P; case X86::COND_O: return X86::COND_NO; case X86::COND_NO: return X86::COND_O; case X86::COND_NE_OR_P: return X86::COND_E_AND_NP; case X86::COND_E_AND_NP: return X86::COND_NE_OR_P; } } /// Assuming the flags are set by MI(a,b), return the condition code if we /// modify the instructions such that flags are set by MI(b,a). 
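/// For example, if CMP a, b feeds a JL, rewriting it as CMP b, a requires the
/// branch to test COND_G for the same outcome; E and NE are symmetric and map
/// to themselves.  Unsupported codes yield COND_INVALID.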
static X86::CondCode getSwappedCondition(X86::CondCode CC) { switch (CC) { default: return X86::COND_INVALID; case X86::COND_E: return X86::COND_E; case X86::COND_NE: return X86::COND_NE; case X86::COND_L: return X86::COND_G; case X86::COND_LE: return X86::COND_GE; case X86::COND_G: return X86::COND_L; case X86::COND_GE: return X86::COND_LE; case X86::COND_B: return X86::COND_A; case X86::COND_BE: return X86::COND_AE; case X86::COND_A: return X86::COND_B; case X86::COND_AE: return X86::COND_BE; } } std::pair X86::getX86ConditionCode(CmpInst::Predicate Predicate) { X86::CondCode CC = X86::COND_INVALID; bool NeedSwap = false; switch (Predicate) { default: break; // Floating-point Predicates case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OGT: CC = X86::COND_A; break; case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_ULT: CC = X86::COND_B; break; case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; case CmpInst::FCMP_UNO: CC = X86::COND_P; break; case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; // Integer Predicates case CmpInst::ICMP_EQ: CC = X86::COND_E; break; case CmpInst::ICMP_NE: CC = X86::COND_NE; break; case CmpInst::ICMP_UGT: CC = X86::COND_A; break; case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; case CmpInst::ICMP_ULT: CC = X86::COND_B; break; case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; case CmpInst::ICMP_SGT: CC = X86::COND_G; break; case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; case CmpInst::ICMP_SLT: CC = X86::COND_L; break; case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; } return std::make_pair(CC, NeedSwap); } /// Return a setcc opcode based on whether it has memory operand. unsigned X86::getSETOpc(bool HasMemoryOperand) { return HasMemoryOperand ? X86::SETCCr : X86::SETCCm; } /// Return a cmov opcode for the given register size in bytes, and operand type. unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) { switch(RegBytes) { default: llvm_unreachable("Illegal register size!"); case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr; case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr; case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr; } } /// Get the VPCMP immediate for the given condition. unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: return 4; case ISD::SETEQ: return 0; case ISD::SETULT: case ISD::SETLT: return 1; case ISD::SETUGT: case ISD::SETGT: return 6; case ISD::SETUGE: case ISD::SETGE: return 5; case ISD::SETULE: case ISD::SETLE: return 2; } } /// Get the VPCMP immediate if the operands are swapped. unsigned X86::getSwappedVPCMPImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); case 0x01: Imm = 0x06; break; // LT -> NLE case 0x02: Imm = 0x05; break; // LE -> NLT case 0x05: Imm = 0x02; break; // NLT -> LE case 0x06: Imm = 0x01; break; // NLE -> LT case 0x00: // EQ case 0x03: // FALSE case 0x04: // NE case 0x07: // TRUE break; } return Imm; } /// Get the VPCOM immediate if the operands are swapped. 
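/// XOP VPCOM immediates 0-3 encode LT/LE/GT/GE and 4-7 encode
/// EQ/NE/FALSE/TRUE; only the ordered predicates change (LT<->GT, LE<->GE)
/// when the operands are exchanged.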
unsigned X86::getSwappedVPCOMImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); case 0x00: Imm = 0x02; break; // LT -> GT case 0x01: Imm = 0x03; break; // LE -> GE case 0x02: Imm = 0x00; break; // GT -> LT case 0x03: Imm = 0x01; break; // GE -> LE case 0x04: // EQ case 0x05: // NE case 0x06: // FALSE case 0x07: // TRUE break; } return Imm; } /// Get the VCMP immediate if the operands are swapped. unsigned X86::getSwappedVCMPImm(unsigned Imm) { // Only need the lower 2 bits to distinquish. switch (Imm & 0x3) { default: llvm_unreachable("Unreachable!"); case 0x00: case 0x03: // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted. break; case 0x01: case 0x02: // Need to toggle bits 3:0. Bit 4 stays the same. Imm ^= 0xf; break; } return Imm; } bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const { switch (MI.getOpcode()) { case X86::TCRETURNdi: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: return true; default: return false; } } bool X86InstrInfo::canMakeTailCallConditional( SmallVectorImpl &BranchCond, const MachineInstr &TailCall) const { if (TailCall.getOpcode() != X86::TCRETURNdi && TailCall.getOpcode() != X86::TCRETURNdi64) { // Only direct calls can be done with a conditional branch. return false; } const MachineFunction *MF = TailCall.getParent()->getParent(); if (Subtarget.isTargetWin64() && MF->hasWinCFI()) { // Conditional tail calls confuse the Win64 unwinder. return false; } assert(BranchCond.size() == 1); if (BranchCond[0].getImm() > X86::LAST_VALID_COND) { // Can't make a conditional tail call with this condition. return false; } const X86MachineFunctionInfo *X86FI = MF->getInfo(); if (X86FI->getTCReturnAddrDelta() != 0 || TailCall.getOperand(1).getImm() != 0) { // A conditional tail call cannot do any stack adjustment. return false; } return true; } void X86InstrInfo::replaceBranchWithTailCall( MachineBasicBlock &MBB, SmallVectorImpl &BranchCond, const MachineInstr &TailCall) const { assert(canMakeTailCallConditional(BranchCond, TailCall)); MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; if (I->isDebugInstr()) continue; if (!I->isBranch()) assert(0 && "Can't find the branch to replace!"); X86::CondCode CC = X86::getCondFromBranch(*I); assert(BranchCond.size() == 1); if (CC != BranchCond[0].getImm()) continue; break; } unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc : X86::TCRETURNdi64cc; auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc)); MIB->addOperand(TailCall.getOperand(0)); // Destination. MIB.addImm(0); // Stack offset (not used). MIB->addOperand(BranchCond[0]); // Condition. MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters. // Add implicit uses and defs of all live regs potentially clobbered by the // call. This way they still appear live across the call. LivePhysRegs LiveRegs(getRegisterInfo()); LiveRegs.addLiveOuts(MBB); SmallVector, 8> Clobbers; LiveRegs.stepForward(*MIB, Clobbers); for (const auto &C : Clobbers) { MIB.addReg(C.first, RegState::Implicit); MIB.addReg(C.first, RegState::Implicit | RegState::Define); } I->eraseFromParent(); } // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may // not be a fallthrough MBB now due to layout changes). Return nullptr if the // fallthrough MBB cannot be identified. 
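// It is used when analyzing and re-emitting the pseudo condition
// COND_E_AND_NP, whose second branch needs an explicit false destination.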
static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB) { // Look for non-EHPad successors other than TBB. If we find exactly one, it // is the fallthrough MBB. If we find zero, then TBB is both the target MBB // and fallthrough MBB. If we find more than one, we cannot identify the // fallthrough MBB and should return nullptr. MachineBasicBlock *FallthroughBB = nullptr; for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB)) continue; // Return a nullptr if we found more than one fallthrough successor. if (FallthroughBB && FallthroughBB != TBB) return nullptr; FallthroughBB = *SI; } return FallthroughBB; } bool X86InstrInfo::AnalyzeBranchImpl( MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, SmallVectorImpl &CondBranches, bool AllowModify) const { // Start from the bottom of the block and work up, examining the // terminator instructions. MachineBasicBlock::iterator I = MBB.end(); MachineBasicBlock::iterator UnCondBrIter = MBB.end(); while (I != MBB.begin()) { --I; if (I->isDebugInstr()) continue; // Working from the bottom, when we see a non-terminator instruction, we're // done. if (!isUnpredicatedTerminator(*I)) break; // A terminator that isn't a branch can't easily be handled by this // analysis. if (!I->isBranch()) return true; // Handle unconditional branches. if (I->getOpcode() == X86::JMP_1) { UnCondBrIter = I; if (!AllowModify) { TBB = I->getOperand(0).getMBB(); continue; } // If the block has any instructions after a JMP, delete them. while (std::next(I) != MBB.end()) std::next(I)->eraseFromParent(); Cond.clear(); FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); continue; } // TBB is used to indicate the unconditional destination. TBB = I->getOperand(0).getMBB(); continue; } // Handle conditional branches. X86::CondCode BranchCode = X86::getCondFromBranch(*I); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. // In practice we should never have an undef eflags operand, if we do // abort here as we are not prepared to preserve the flag. if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef()) return true; // Working from the bottom, handle the first conditional branch. if (Cond.empty()) { MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); if (AllowModify && UnCondBrIter != MBB.end() && MBB.isLayoutSuccessor(TargetBB)) { // If we can modify the code and it ends in something like: // // jCC L1 // jmp L2 // L1: // ... // L2: // // Then we can change this to: // // jnCC L2 // L1: // ... // L2: // // Which is a bit more efficient. // We conditionally jump to the fall-through block. BranchCode = GetOppositeBranchCondition(BranchCode); MachineBasicBlock::iterator OldInst = I; BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1)) .addMBB(UnCondBrIter->getOperand(0).getMBB()) .addImm(BranchCode); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) .addMBB(TargetBB); OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); // Restart the analysis. UnCondBrIter = MBB.end(); I = MBB.end(); continue; } FBB = TBB; TBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(BranchCode)); CondBranches.push_back(&*I); continue; } // Handle subsequent conditional branches. 
Only handle the case where all // conditional branches branch to the same destination and their condition // opcodes fit one of the special multi-branch idioms. assert(Cond.size() == 1); assert(TBB); // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); auto NewTBB = I->getOperand(0).getMBB(); if (OldBranchCode == BranchCode && TBB == NewTBB) continue; // If they differ, see if they fit one of the known patterns. Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. if (TBB == NewTBB && ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { BranchCode = X86::COND_NE_OR_P; } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB))) return true; // X86::COND_E_AND_NP usually has two different branch destinations. // // JP B1 // JE B2 // JMP B1 // B1: // B2: // // Here this condition branches to B2 only if NP && E. It has another // equivalent form: // // JNE B1 // JNP B2 // JMP B1 // B1: // B2: // // Similarly it branches to B2 only if E && NP. That is why this condition // is named with COND_E_AND_NP. BranchCode = X86::COND_E_AND_NP; } else return true; // Update the MachineOperand. Cond[0].setImm(BranchCode); CondBranches.push_back(&*I); } return false; } bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { SmallVector CondBranches; return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify); } bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const { using namespace std::placeholders; SmallVector Cond; SmallVector CondBranches; if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches, AllowModify)) return true; if (Cond.size() != 1) return true; assert(MBP.TrueDest && "expected!"); if (!MBP.FalseDest) MBP.FalseDest = MBB.getNextNode(); const TargetRegisterInfo *TRI = &getRegisterInfo(); MachineInstr *ConditionDef = nullptr; bool SingleUseCondition = true; for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) { if (I->modifiesRegister(X86::EFLAGS, TRI)) { ConditionDef = &*I; break; } if (I->readsRegister(X86::EFLAGS, TRI)) SingleUseCondition = false; } if (!ConditionDef) return true; if (SingleUseCondition) { for (auto *Succ : MBB.successors()) if (Succ->isLiveIn(X86::EFLAGS)) SingleUseCondition = false; } MBP.ConditionDef = ConditionDef; MBP.SingleUseCondition = SingleUseCondition; // Currently we only recognize the simple pattern: // // test %reg, %reg // je %label // const unsigned TestOpcode = Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr; if (ConditionDef->getOpcode() == TestOpcode && ConditionDef->getNumOperands() == 3 && ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) && (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) { MBP.LHS = ConditionDef->getOperand(0); MBP.RHS = MachineOperand::CreateImm(0); MBP.Predicate = Cond[0].getImm() == X86::COND_NE ? 
MachineBranchPredicate::PRED_NE : MachineBranchPredicate::PRED_EQ; return false; } return true; } unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { assert(!BytesRemoved && "code size not handled"); MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; while (I != MBB.begin()) { --I; if (I->isDebugInstr()) continue; if (I->getOpcode() != X86::JMP_1 && X86::getCondFromBranch(*I) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); I = MBB.end(); ++Count; } return Count; } unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded) const { // Shouldn't be a fall through. assert(TBB && "insertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && "X86 branch conditions have one component!"); assert(!BytesAdded && "code size not handled"); if (Cond.empty()) { // Unconditional branch? assert(!FBB && "Unconditional branch with multiple successors!"); BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); return 1; } // If FBB is null, it is implied to be a fall-through block. bool FallThru = FBB == nullptr; // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); switch (CC) { case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE); ++Count; BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P); ++Count; break; case X86::COND_E_AND_NP: // Use the next block of MBB as FBB if it is null. if (FBB == nullptr) { FBB = getFallThroughMBB(&MBB, TBB); assert(FBB && "MBB cannot be the last block in function when the false " "body is a fall-through."); } // Synthesize COND_E_AND_NP with two branches. BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE); ++Count; BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP); ++Count; break; default: { BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC); ++Count; } } if (!FallThru) { // Two-way Conditional branch. Insert the second branch. BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; } return Count; } bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. if (!Subtarget.hasCMov()) return false; if (Cond.size() != 1) return false; // We cannot do the composite conditions, at least not in SSA form. if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND) return false; // Check register classes. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); if (!RC) return false; // We have cmov instructions for 16, 32, and 64 bit general purpose registers. if (X86::GR16RegClass.hasSubClassEq(RC) || X86::GR32RegClass.hasSubClassEq(RC) || X86::GR64RegClass.hasSubClassEq(RC)) { // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy // Bridge. Probably Ivy Bridge as well. CondCycles = 2; TrueCycles = 2; FalseCycles = 2; return true; } // Can't do vectors. 
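// This hook only knows how to materialize a select as an integer CMOV, so
// vector and mask register classes are rejected.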
return false; } void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef Cond, Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); assert(Cond.size() == 1 && "Invalid Cond array"); unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8, false /*HasMemoryOperand*/); BuildMI(MBB, I, DL, get(Opc), DstReg) .addReg(FalseReg) .addReg(TrueReg) .addImm(Cond[0].getImm()); } /// Test if the given register is a physical h register. static bool isHReg(unsigned Reg) { return X86::GR8_ABCD_HRegClass.contains(Reg); } // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, const X86Subtarget &Subtarget) { bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); // SrcReg(MaskReg) -> DestReg(GR64) // SrcReg(MaskReg) -> DestReg(GR32) // All KMASK RegClasses hold the same k registers, can be tested against anyone. if (X86::VK16RegClass.contains(SrcReg)) { if (X86::GR64RegClass.contains(DestReg)) { assert(Subtarget.hasBWI()); return X86::KMOVQrk; } if (X86::GR32RegClass.contains(DestReg)) return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk; } // SrcReg(GR64) -> DestReg(MaskReg) // SrcReg(GR32) -> DestReg(MaskReg) // All KMASK RegClasses hold the same k registers, can be tested against anyone. if (X86::VK16RegClass.contains(DestReg)) { if (X86::GR64RegClass.contains(SrcReg)) { assert(Subtarget.hasBWI()); return X86::KMOVQkr; } if (X86::GR32RegClass.contains(SrcReg)) return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr; } // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) // SrcReg(GR64) -> DestReg(VR64) if (X86::GR64RegClass.contains(DestReg)) { if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. return HasAVX512 ? X86::VMOVPQIto64Zrr : HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr; if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. return X86::MMX_MOVD64from64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. if (X86::VR128XRegClass.contains(DestReg)) return HasAVX512 ? X86::VMOV64toPQIZrr : HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr; // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) return X86::MMX_MOVD64to64rr; } // SrcReg(VR128) -> DestReg(GR32) // SrcReg(GR32) -> DestReg(VR128) if (X86::GR32RegClass.contains(DestReg) && X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR32 register. return HasAVX512 ? X86::VMOVPDI2DIZrr : HasAVX ? X86::VMOVPDI2DIrr : X86::MOVPDI2DIrr; if (X86::VR128XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) // Copy from a VR128 register to a VR128 register. return HasAVX512 ? X86::VMOVDI2PDIZrr : HasAVX ? X86::VMOVDI2PDIrr : X86::MOVDI2PDIrr; return 0; } void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. 
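// "Symmetric" means both registers live in the same class and a plain
// register-to-register move suffices; cross-class copies (GPR <-> vector or
// mask) are handled later via CopyToFromAsymmetricReg.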
bool HasAVX = Subtarget.hasAVX(); bool HasVLX = Subtarget.hasVLX(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; else if (X86::GR32RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV32rr; else if (X86::GR16RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV16rr; else if (X86::GR8RegClass.contains(DestReg, SrcReg)) { // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) { Opc = X86::MOV8rr_NOREX; // Both operands must be encodable without an REX prefix. assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && "8-bit H register can not be copied outside GR8_NOREX"); } else Opc = X86::MOV8rr; } else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) { if (HasVLX) Opc = X86::VMOVAPSZ128rr; else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else { // If this an extended register and we don't have VLX we need to use a // 512-bit move. Opc = X86::VMOVAPSZrr; const TargetRegisterInfo *TRI = &getRegisterInfo(); DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass); SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); } } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { if (HasVLX) Opc = X86::VMOVAPSZ256rr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; else { // If this an extended register and we don't have VLX we need to use a // 512-bit move. Opc = X86::VMOVAPSZrr; const TargetRegisterInfo *TRI = &getRegisterInfo(); DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass); SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); } } else if (X86::VR512RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSZrr; // All KMASK RegClasses hold the same k registers, can be tested against anyone. else if (X86::VK16RegClass.contains(DestReg, SrcReg)) Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk; if (!Opc) Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) { // FIXME: We use a fatal error here because historically LLVM has tried // lower some of these physreg copies and we want to ensure we get // reasonable bug reports if someone encounters a case no other testing // found. This path should be removed after the LLVM 7 release. report_fatal_error("Unable to copy EFLAGS physical register!"); } LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to " << RI.getName(DestReg) << '\n'); report_fatal_error("Cannot emit physreg copy instruction"); } Optional X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { if (MI.isMoveReg()) return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; return None; } static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool load) { bool HasAVX = STI.hasAVX(); bool HasAVX512 = STI.hasAVX512(); bool HasVLX = STI.hasVLX(); switch (STI.getRegisterInfo()->getSpillSize(*RC)) { default: llvm_unreachable("Unknown spill size"); case 1: assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); if (STI.is64Bit()) // Copying to or from a physical H register on x86-64 requires a NOREX // move. 
Otherwise use a normal move. if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; return load ? X86::MOV8rm : X86::MOV8mr; case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: if (X86::GR32RegClass.hasSubClassEq(RC)) return load ? X86::MOV32rm : X86::MOV32mr; if (X86::FR32XRegClass.hasSubClassEq(RC)) return load ? (HasAVX512 ? X86::VMOVSSZrm_alt : HasAVX ? X86::VMOVSSrm_alt : X86::MOVSSrm_alt) : (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); if (X86::RFP32RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp32m : X86::ST_Fp32m; if (X86::VK32RegClass.hasSubClassEq(RC)) { assert(STI.hasBWI() && "KMOVD requires BWI"); return load ? X86::KMOVDkm : X86::KMOVDmk; } // All of these mask pair classes have the same spill size, the same kind // of kmov instructions can be used with all of them. if (X86::VK1PAIRRegClass.hasSubClassEq(RC) || X86::VK2PAIRRegClass.hasSubClassEq(RC) || X86::VK4PAIRRegClass.hasSubClassEq(RC) || X86::VK8PAIRRegClass.hasSubClassEq(RC) || X86::VK16PAIRRegClass.hasSubClassEq(RC)) return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) return load ? X86::MOV64rm : X86::MOV64mr; if (X86::FR64XRegClass.hasSubClassEq(RC)) return load ? (HasAVX512 ? X86::VMOVSDZrm_alt : HasAVX ? X86::VMOVSDrm_alt : X86::MOVSDrm_alt) : (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); if (X86::VR64RegClass.hasSubClassEq(RC)) return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; if (X86::RFP64RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp64m : X86::ST_Fp64m; if (X86::VK64RegClass.hasSubClassEq(RC)) { assert(STI.hasBWI() && "KMOVQ requires BWI"); return load ? X86::KMOVQkm : X86::KMOVQmk; } llvm_unreachable("Unknown 8-byte regclass"); case 10: assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { if (X86::VR128XRegClass.hasSubClassEq(RC)) { // If stack is realigned we can use aligned stores. if (IsStackAligned) return load ? (HasVLX ? X86::VMOVAPSZ128rm : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm): (HasVLX ? X86::VMOVAPSZ128mr : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); else return load ? (HasVLX ? X86::VMOVUPSZ128rm : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm): (HasVLX ? X86::VMOVUPSZ128mr : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); } if (X86::BNDRRegClass.hasSubClassEq(RC)) { if (STI.is64Bit()) return load ? X86::BNDMOV64rm : X86::BNDMOV64mr; else return load ? X86::BNDMOV32rm : X86::BNDMOV32mr; } llvm_unreachable("Unknown 16-byte regclass"); } case 32: assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. if (IsStackAligned) return load ? (HasVLX ? X86::VMOVAPSZ256rm : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX : X86::VMOVAPSYrm) : (HasVLX ? X86::VMOVAPSZ256mr : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX : X86::VMOVAPSYmr); else return load ? (HasVLX ? X86::VMOVUPSZ256rm : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX : X86::VMOVUPSYrm) : (HasVLX ? X86::VMOVUPSZ256mr : HasAVX512 ? 
X86::VMOVUPSZ256mr_NOVLX : X86::VMOVUPSYmr); case 64: assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); assert(STI.hasAVX512() && "Using 512-bit register requires AVX512"); if (IsStackAligned) return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } } Optional X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const { const MCInstrDesc &Desc = MemI.getDesc(); int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); if (MemRefBegin < 0) return None; MemRefBegin += X86II::getOperandBias(Desc); auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg); if (!BaseOp.isReg()) // Can be an MO_FrameIndex return None; const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp); // Displacement can be symbolic if (!DispMO.isImm()) return None; ExtAddrMode AM; AM.BaseReg = BaseOp.getReg(); AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg(); AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm(); AM.Displacement = DispMO.getImm(); return AM; } bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const { if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri) return false; // Mov Src can be a global address. if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg) return false; ImmVal = MI.getOperand(1).getImm(); return true; } bool X86InstrInfo::preservesZeroValueInReg( const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const { if (!MI->modifiesRegister(NullValueReg, TRI)) return true; switch (MI->getOpcode()) { // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax // X. case X86::SHR64ri: case X86::SHR32ri: case X86::SHL64ri: case X86::SHL32ri: assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() && "expected for shift opcode!"); return MI->getOperand(0).getReg() == NullValueReg && MI->getOperand(1).getReg() == NullValueReg; // Zero extend of a sub-reg of NullValueReg into itself does not change the // null value. case X86::MOV32rr: return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) { return TRI->isSubRegisterEq(NullValueReg, MO.getReg()); }); default: return false; } llvm_unreachable("Should be handled above!"); } bool X86InstrInfo::getMemOperandsWithOffsetWidth( const MachineInstr &MemOp, SmallVectorImpl &BaseOps, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, const TargetRegisterInfo *TRI) const { const MCInstrDesc &Desc = MemOp.getDesc(); int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); if (MemRefBegin < 0) return false; MemRefBegin += X86II::getOperandBias(Desc); const MachineOperand *BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); if (!BaseOp->isReg()) // Can be an MO_FrameIndex return false; if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) return false; if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != X86::NoRegister) return false; const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp); // Displacement can be symbolic if (!DispMO.isImm()) return false; Offset = DispMO.getImm(); if (!BaseOp->isReg()) return false; OffsetIsScalable = false; // FIXME: Relying on memoperands() may not be right thing to do here. Check // with X86 maintainers, and fix it accordingly. For now, it is ok, since // there is no use of `Width` for X86 back-end at the moment. 
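// With no MachineMemOperand attached, the width is reported as 0 (unknown).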
Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0; BaseOps.push_back(BaseOp); return true; } static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI) { return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false); } static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI) { return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILESTORED; // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); MachineInstr *NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) .addReg(SrcReg, getKillRegState(isKill)); MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if (RC->getID() == X86::TILECFGRegClassID) { + unsigned Opc = X86::PSTTILECFG; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) .addReg(SrcReg, getKillRegState(isKill)); } } void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILELOADD; // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); MachineInstr *NewMI = BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if (RC->getID() == X86::TILECFGRegClassID) { + unsigned Opc = X86::PLDTILECFG; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } } bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int &CmpMask, int &CmpValue) const { switch (MI.getOpcode()) { default: break; case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: SrcReg = 
MI.getOperand(0).getReg(); SrcReg2 = 0; if (MI.getOperand(1).isImm()) { CmpMask = ~0; CmpValue = MI.getOperand(1).getImm(); } else { CmpMask = CmpValue = 0; } return true; // A SUB can be used to perform comparison. case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = 0; CmpValue = 0; return true; case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); CmpMask = 0; CmpValue = 0; return true; case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; if (MI.getOperand(2).isImm()) { CmpMask = ~0; CmpValue = MI.getOperand(2).getImm(); } else { CmpMask = CmpValue = 0; } return true; case X86::CMP64rr: case X86::CMP32rr: case X86::CMP16rr: case X86::CMP8rr: SrcReg = MI.getOperand(0).getReg(); SrcReg2 = MI.getOperand(1).getReg(); CmpMask = 0; CmpValue = 0; return true; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: case X86::TEST64rr: SrcReg = MI.getOperand(0).getReg(); if (MI.getOperand(1).getReg() != SrcReg) return false; // Compare against zero. SrcReg2 = 0; CmpMask = ~0; CmpValue = 0; return true; } return false; } /// Check whether the first instruction, whose only /// purpose is to update flags, can be made redundant. /// CMPrr can be made redundant by SUBrr if the operands are the same. /// This function can be extended later on. /// SrcReg, SrcRegs: register operands for FlagI. /// ImmValue: immediate for FlagI if it takes an immediate. inline static bool isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg, Register SrcReg2, int ImmMask, int ImmValue, const MachineInstr &OI) { if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) || (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) || (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) || (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) && ((OI.getOperand(1).getReg() == SrcReg && OI.getOperand(2).getReg() == SrcReg2) || (OI.getOperand(1).getReg() == SrcReg2 && OI.getOperand(2).getReg() == SrcReg))) return true; if (ImmMask != 0 && ((FlagI.getOpcode() == X86::CMP64ri32 && OI.getOpcode() == X86::SUB64ri32) || (FlagI.getOpcode() == X86::CMP64ri8 && OI.getOpcode() == X86::SUB64ri8) || (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) || (FlagI.getOpcode() == X86::CMP32ri8 && OI.getOpcode() == X86::SUB32ri8) || (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) || (FlagI.getOpcode() == X86::CMP16ri8 && OI.getOpcode() == X86::SUB16ri8) || (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) && OI.getOperand(1).getReg() == SrcReg && OI.getOperand(2).getImm() == ImmValue) return true; return false; } /// Check whether the definition can be converted /// to remove a comparison against zero. inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) { NoSignFlag = false; switch (MI.getOpcode()) { default: return false; // The shift instructions only modify ZF if their shift count is non-zero. // N.B.: The processor truncates the shift count depending on the encoding. 
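// Illustrative example (a sketch, registers chosen arbitrarily): the count of
// a 32-bit shift is masked to 5 bits, so
//   shrl $32, %eax
// truncates to a shift by zero and leaves EFLAGS untouched, which is why
// getTruncatedShiftCount() must report a non-zero count before the definition
// below may stand in for a compare against zero.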
case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri: case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri: return getTruncatedShiftCount(MI, 2) != 0; // Some left shift instructions can be turned into LEA instructions but only // if their flags aren't used. Avoid transforming such instructions. case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{ unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (isTruncatedShiftCountForLEA(ShAmt)) return false; return ShAmt != 0; } case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8: case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8: return getTruncatedShiftCount(MI, 3) != 0; case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri: case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8: case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr: case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm: case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm: case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri: case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8: case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr: case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm: case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm: case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: case X86::ANDN64rm: case X86::BLSI32rr: case X86::BLSI32rm: case X86::BLSI64rr: case X86::BLSI64rm: case X86::BLSMSK32rr:case X86::BLSMSK32rm: case X86::BLSMSK64rr:case X86::BLSMSK64rm: case X86::BLSR32rr: case X86::BLSR32rm: case X86::BLSR64rr: case X86::BLSR64rm: case X86::BZHI32rr: case X86::BZHI32rm: case X86::BZHI64rr: case X86::BZHI64rm: case 
X86::LZCNT16rr: case X86::LZCNT16rm: case X86::LZCNT32rr: case X86::LZCNT32rm: case X86::LZCNT64rr: case X86::LZCNT64rm: case X86::POPCNT16rr:case X86::POPCNT16rm: case X86::POPCNT32rr:case X86::POPCNT32rm: case X86::POPCNT64rr:case X86::POPCNT64rm: case X86::TZCNT16rr: case X86::TZCNT16rm: case X86::TZCNT32rr: case X86::TZCNT32rm: case X86::TZCNT64rr: case X86::TZCNT64rm: case X86::BLCFILL32rr: case X86::BLCFILL32rm: case X86::BLCFILL64rr: case X86::BLCFILL64rm: case X86::BLCI32rr: case X86::BLCI32rm: case X86::BLCI64rr: case X86::BLCI64rm: case X86::BLCIC32rr: case X86::BLCIC32rm: case X86::BLCIC64rr: case X86::BLCIC64rm: case X86::BLCMSK32rr: case X86::BLCMSK32rm: case X86::BLCMSK64rr: case X86::BLCMSK64rm: case X86::BLCS32rr: case X86::BLCS32rm: case X86::BLCS64rr: case X86::BLCS64rm: case X86::BLSFILL32rr: case X86::BLSFILL32rm: case X86::BLSFILL64rr: case X86::BLSFILL64rm: case X86::BLSIC32rr: case X86::BLSIC32rm: case X86::BLSIC64rr: case X86::BLSIC64rm: case X86::T1MSKC32rr: case X86::T1MSKC32rm: case X86::T1MSKC64rr: case X86::T1MSKC64rm: case X86::TZMSK32rr: case X86::TZMSK32rm: case X86::TZMSK64rr: case X86::TZMSK64rm: return true; case X86::BEXTR32rr: case X86::BEXTR64rr: case X86::BEXTR32rm: case X86::BEXTR64rm: case X86::BEXTRI32ri: case X86::BEXTRI32mi: case X86::BEXTRI64ri: case X86::BEXTRI64mi: // BEXTR doesn't update the sign flag so we can't use it. NoSignFlag = true; return true; } } /// Check whether the use can be converted to remove a comparison against zero. static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: return X86::COND_AE; case X86::LZCNT16rr: case X86::LZCNT32rr: case X86::LZCNT64rr: return X86::COND_B; case X86::POPCNT16rr: case X86::POPCNT32rr: case X86::POPCNT64rr: return X86::COND_E; case X86::TZCNT16rr: case X86::TZCNT32rr: case X86::TZCNT64rr: return X86::COND_B; case X86::BSF16rr: case X86::BSF32rr: case X86::BSF64rr: case X86::BSR16rr: case X86::BSR32rr: case X86::BSR64rr: return X86::COND_E; case X86::BLSI32rr: case X86::BLSI64rr: return X86::COND_AE; case X86::BLSR32rr: case X86::BLSR64rr: case X86::BLSMSK32rr: case X86::BLSMSK64rr: return X86::COND_B; // TODO: TBM instructions. } } /// Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { // Check whether we can replace SUB with CMP. switch (CmpInstr.getOpcode()) { default: break; case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: { if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; // There is no use of the destination register, we can replace SUB with CMP. 
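// Illustrative sketch of the rewrite performed below (virtual register
// numbers are invented):
//   dead %2:gr32 = SUB32rr %0, %1, implicit-def $eflags
// becomes
//   CMP32rr %0, %1, implicit-def $eflags
// once the unused destination operand has been removed.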
unsigned NewOpcode = 0; switch (CmpInstr.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; case X86::SUB32rm: NewOpcode = X86::CMP32rm; break; case X86::SUB16rm: NewOpcode = X86::CMP16rm; break; case X86::SUB8rm: NewOpcode = X86::CMP8rm; break; case X86::SUB64rr: NewOpcode = X86::CMP64rr; break; case X86::SUB32rr: NewOpcode = X86::CMP32rr; break; case X86::SUB16rr: NewOpcode = X86::CMP16rr; break; case X86::SUB8rr: NewOpcode = X86::CMP8rr; break; case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break; case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break; case X86::SUB32ri: NewOpcode = X86::CMP32ri; break; case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break; case X86::SUB16ri: NewOpcode = X86::CMP16ri; break; case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break; case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; } CmpInstr.setDesc(get(NewOpcode)); CmpInstr.RemoveOperand(0); // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) return false; } } // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; // CmpInstr is the first instruction of the BB. MachineBasicBlock::iterator I = CmpInstr, Def = MI; // If we are comparing against zero, check whether we can use MI to update // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. bool IsCmpZero = (CmpMask != 0 && CmpValue == 0); if (IsCmpZero && MI->getParent() != CmpInstr.getParent()) return false; // If we have a use of the source register between the def and our compare // instruction we can eliminate the compare iff the use sets EFLAGS in the // right way. bool ShouldUpdateCC = false; bool NoSignFlag = false; X86::CondCode NewCC = X86::COND_INVALID; if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag)) { // Scan forward from the use until we hit the use we're looking for or the // compare instruction. for (MachineBasicBlock::iterator J = MI;; ++J) { // Do we have a convertible instruction? NewCC = isUseDefConvertible(*J); if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && J->getOperand(1).getReg() == SrcReg) { assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); ShouldUpdateCC = true; // Update CC later on. // This is not a def of SrcReg, but still a def of EFLAGS. Keep going // with the new def. Def = J; MI = &*Def; break; } if (J == I) return false; } } // We are searching for an earlier instruction that can make CmpInstr // redundant and that instruction will be saved in Sub. MachineInstr *Sub = nullptr; const TargetRegisterInfo *TRI = &getRegisterInfo(); // We iterate backward, starting from the instruction before CmpInstr and // stop when reaching the definition of a source register or done with the BB. // RI points to the instruction before CmpInstr. // If the definition is in this basic block, RE points to the definition; // otherwise, RE is the rend of the basic block. MachineBasicBlock::reverse_iterator RI = ++I.getReverse(), RE = CmpInstr.getParent() == MI->getParent() ? Def.getReverse() /* points to MI */ : CmpInstr.getParent()->rend(); MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { MachineInstr &Instr = *RI; // Check whether CmpInstr can be made redundant by the current instruction. 
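// Illustrative sketch (register names are arbitrary): scanning backwards from
//   CMP32rr %a, %b, implicit-def $eflags
// and reaching
//   %d:gr32 = SUB32rr %a, %b, implicit-def $eflags
// finds an instruction that already set EFLAGS exactly as the compare would
// (see isRedundantFlagInstr), so it becomes the Sub candidate below.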
if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue, Instr)) { Sub = &Instr; break; } if (Instr.modifiesRegister(X86::EFLAGS, TRI) || Instr.readsRegister(X86::EFLAGS, TRI)) { // This instruction modifies or uses EFLAGS. // MOV32r0 etc. are implemented with xor which clobbers condition code. // They are safe to move up, if the definition to EFLAGS is dead and // earlier instructions do not read or write EFLAGS. if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 && Instr.registerDefIsDead(X86::EFLAGS, TRI)) { Movr0Inst = &Instr; continue; } // We can't remove CmpInstr. return false; } } // Return false if no candidates exist. if (!IsCmpZero && !Sub) return false; bool IsSwapped = (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 && Sub->getOperand(2).getReg() == SrcReg); // Scan forward from the instruction after CmpInstr for uses of EFLAGS. // It is safe to remove CmpInstr if EFLAGS is redefined or killed. // If we are done with the basic block, we need to check whether EFLAGS is // live-out. bool IsSafe = false; SmallVector, 4> OpsToUpdate; MachineBasicBlock::iterator E = CmpInstr.getParent()->end(); for (++I; I != E; ++I) { const MachineInstr &Instr = *I; bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI); // We should check the usage if this instruction uses and updates EFLAGS. if (!UseEFLAGS && ModifyEFLAGS) { // It is safe to remove CmpInstr if EFLAGS is updated again. IsSafe = true; break; } if (!UseEFLAGS && !ModifyEFLAGS) continue; // EFLAGS is used by this instruction. X86::CondCode OldCC = X86::COND_INVALID; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) OldCC = X86::getCondFromBranch(Instr); else { OldCC = X86::getCondFromSETCC(Instr); if (OldCC == X86::COND_INVALID) OldCC = X86::getCondFromCMov(Instr); } if (OldCC == X86::COND_INVALID) return false; } X86::CondCode ReplacementCC = X86::COND_INVALID; if (IsCmpZero) { switch (OldCC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: // CF and OF are used, we can't perform this optimization. return false; case X86::COND_S: case X86::COND_NS: // If SF is used, but the instruction doesn't update the SF, then we // can't do the optimization. if (NoSignFlag) return false; break; } // If we're updating the condition code check if we have to reverse the // condition. if (ShouldUpdateCC) switch (OldCC) { default: return false; case X86::COND_E: ReplacementCC = NewCC; break; case X86::COND_NE: ReplacementCC = GetOppositeBranchCondition(NewCC); break; } } else if (IsSwapped) { // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. // We swap the condition code and synthesize the new opcode. ReplacementCC = getSwappedCondition(OldCC); if (ReplacementCC == X86::COND_INVALID) return false; } if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC)); } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. 
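// (Illustratively, an instruction carrying "implicit killed $eflags" proves
// the flags are dead from this point on, so erasing CmpInstr is safe.)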
IsSafe = true; break; } } // If EFLAGS is not killed nor re-defined, we should check whether it is // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr.getParent(); for (MachineBasicBlock *Successor : MBB->successors()) if (Successor->isLiveIn(X86::EFLAGS)) return false; } // The instruction to be updated is either Sub or MI. Sub = IsCmpZero ? MI : Sub; // Move Movr0Inst to the appropriate place before Sub. if (Movr0Inst) { // Look backwards until we find a def that doesn't use the current EFLAGS. Def = Sub; MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(), InsertE = Sub->getParent()->rend(); for (; InsertI != InsertE; ++InsertI) { MachineInstr *Instr = &*InsertI; if (!Instr->readsRegister(X86::EFLAGS, TRI) && Instr->modifiesRegister(X86::EFLAGS, TRI)) { Sub->getParent()->remove(Movr0Inst); Instr->getParent()->insert(MachineBasicBlock::iterator(Instr), Movr0Inst); break; } } if (InsertI == InsertE) return false; } // Make sure Sub instruction defines EFLAGS and mark the def live. MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS); assert(FlagDef && "Unable to locate a def EFLAGS operand"); FlagDef->setIsDead(false); CmpInstr.eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. for (auto &Op : OpsToUpdate) { Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1) .setImm(Op.second); } return true; } /// Try to remove the load by folding it to a register /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, Register &FoldAsLoadDefReg, MachineInstr *&DefMI) const { // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); bool SawStore = false; if (!DefMI->isSafeToMove(nullptr, SawStore)) return nullptr; // Collect information about virtual register operands of MI. SmallVector SrcOperandIds; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; // Do not fold if we have a subreg use or a def. if (MO.getSubReg() || MO.isDef()) return nullptr; SrcOperandIds.push_back(i); } if (SrcOperandIds.empty()) return nullptr; // Check whether we can fold the def into SrcOperandId. if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } return nullptr; } /// Expand a single-def pseudo instruction to a two-addr /// instruction with two undef reads of the register being defined. /// This is used for mapping: /// %xmm4 = V_SET0 /// to: /// %xmm4 = PXORrr undef %xmm4, undef %xmm4 /// static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); Register Reg = MIB.getReg(0); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any // implicit operands. MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); // But we don't trust that. assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand"); return true; } /// Expand a single-def pseudo instruction to a two-addr /// instruction with two %k0 reads. 
/// This is used for mapping: /// %k4 = K_SET1 /// to: /// %k4 = KXNORrr %k0, %k0 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); MIB->setDesc(Desc); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); return true; } static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); Register Reg = MIB.getReg(0); // Insert the XOR. BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); // Turn the pseudo into an INC or DEC. MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); MIB.addReg(Reg); return true; } static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, const X86Subtarget &Subtarget) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); int64_t Imm = MIB->getOperand(1).getImm(); assert(Imm != 0 && "Using push/pop for 0 is not efficient."); MachineBasicBlock::iterator I = MIB.getInstr(); int StackAdjustment; if (Subtarget.is64Bit()) { assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || MIB->getOpcode() == X86::MOV32ImmSExti8); // Can't use push/pop lowering if the function might write to the red zone. X86MachineFunctionInfo *X86FI = MBB.getParent()->getInfo(); if (X86FI->getUsesRedZone()) { MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri)); return true; } // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and // widen the register if necessary. StackAdjustment = 8; BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm); MIB->setDesc(TII.get(X86::POP64r)); MIB->getOperand(0) .setReg(getX86SubSuperRegister(MIB.getReg(0), 64)); } else { assert(MIB->getOpcode() == X86::MOV32ImmSExti8); StackAdjustment = 4; BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm); MIB->setDesc(TII.get(X86::POP32r)); } MIB->RemoveOperand(1); MIB->addImplicitDefUseOperands(*MBB.getParent()); // Build CFI if necessary. MachineFunction &MF = *MBB.getParent(); const X86FrameLowering *TFL = Subtarget.getFrameLowering(); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves(); bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; if (EmitCFI) { TFL->BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); TFL->BuildCFI(MBB, std::next(I), DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); } return true; } // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. 
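// On that target the pseudo is lowered to a GOT-relative load of the guard
// variable's address followed by a load through it, roughly (sketch):
//   movq <stack guard global>@GOTPCREL(%rip), %reg
//   movq (%reg), %reg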
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); Register Reg = MIB.getReg(0); const GlobalValue *GV = cast((*MIB->memoperands_begin())->getValue()); auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8)); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0) .addMemOperand(MMO); MIB->setDebugLoc(DL); MIB->setDesc(TII.get(X86::MOV64rm)); MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); MachineFunction &MF = *MBB.getParent(); const X86Subtarget &Subtarget = MF.getSubtarget(); const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); unsigned XorOp = MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr; MIB->setDesc(TII.get(XorOp)); MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef); return true; } // This is used to handle spills for 128/256-bit registers when we have AVX512, // but not VLX. If it uses an extended register we need to use an instruction // that loads the lower 128/256-bit, but is available with only AVX512F. static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx) { Register DestReg = MIB.getReg(0); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(DestReg) < 16) { // We can use a normal VEX encoded load. MIB->setDesc(LoadDesc); } else { // Use a 128/256-bit VBROADCAST instruction. MIB->setDesc(BroadcastDesc); // Change the destination to a 512-bit register. DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass); MIB->getOperand(0).setReg(DestReg); } return true; } // This is used to handle spills for 128/256-bit registers when we have AVX512, // but not VLX. If it uses an extended register we need to use an instruction // that stores the lower 128/256-bit, but is available with only AVX512F. static bool expandNOVLXStore(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx) { Register SrcReg = MIB.getReg(X86::AddrNumOperands); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(SrcReg) < 16) { // We can use a normal VEX encoded store. MIB->setDesc(StoreDesc); } else { // Use a VEXTRACTF instruction. MIB->setDesc(ExtractDesc); // Change the destination to a 512-bit register. SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass); MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg); MIB.addImm(0x0); // Append immediate to extract from the lower bits. } return true; } static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { MIB->setDesc(Desc); int64_t ShiftAmt = MIB->getOperand(2).getImm(); // Temporarily remove the immediate so we can add another source register. MIB->RemoveOperand(2); // Add the register. Don't copy the kill flag if there is one. MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef())); // Add back the immediate. 
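// (Illustrative sketch with a physical register: %eax = SHLDROT32ri %eax, 5
// becomes %eax = SHLD32rri8 %eax, %eax, 5, i.e. a rotate expressed as a
// double shift whose two sources are the same register.)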
MIB.addImm(ShiftAmt); return true; } bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); switch (MI.getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); case X86::MOV32r1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); case X86::MOV32r_1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); case X86::MOV32ImmSExti8: case X86::MOV64ImmSExti8: return ExpandMOVImmSExti8(MIB, *this, Subtarget); case X86::SETB_C32r: return Expand2AddrUndef(MIB, get(X86::SBB32rr)); case X86::SETB_C64r: return Expand2AddrUndef(MIB, get(X86::SBB64rr)); case X86::MMX_SET0: return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr)); case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { assert(HasAVX && "AVX not supported"); const TargetRegisterInfo *TRI = &getRegisterInfo(); Register SrcReg = MIB.getReg(0); Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(X86::VXORPSrr)); MIB.addReg(SrcReg, RegState::ImplicitDefine); return true; } case X86::AVX512_128_SET0: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0F128: { bool HasVLX = Subtarget.hasVLX(); Register SrcReg = MIB.getReg(0); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) return Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); // Extended register without VLX. Use a larger XOR. SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); MIB->getOperand(0).setReg(SrcReg); return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: { bool HasVLX = Subtarget.hasVLX(); Register SrcReg = MIB.getReg(0); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); MIB.addReg(SrcReg, RegState::ImplicitDefine); return true; } if (MI.getOpcode() == X86::AVX512_256_SET0) { // No VLX so we must reference a zmm. unsigned ZReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); MIB->getOperand(0).setReg(ZReg); } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::AVX1_SETALLONES: { Register Reg = MIB.getReg(0); // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. MIB->setDesc(get(X86::VCMPPSYrri)); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } case X86::AVX512_512_SETALLONES: { Register Reg = MIB.getReg(0); MIB->setDesc(get(X86::VPTERNLOGDZrri)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef).addImm(0xff); return true; } case X86::AVX512_512_SEXT_MASK_32: case X86::AVX512_512_SEXT_MASK_64: { Register Reg = MIB.getReg(0); Register MaskReg = MIB.getReg(1); unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? 
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; MI.RemoveOperand(1); MIB->setDesc(get(Opc)); // VPTERNLOG needs 3 register inputs and an immediate. // 0xff will return 1s for any input. MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff); return true; } case X86::VMOVAPSZ128rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), get(X86::VBROADCASTF32X4rm), X86::sub_xmm); case X86::VMOVUPSZ128rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm), get(X86::VBROADCASTF32X4rm), X86::sub_xmm); case X86::VMOVAPSZ256rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm), get(X86::VBROADCASTF64X4rm), X86::sub_ymm); case X86::VMOVUPSZ256rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm), get(X86::VBROADCASTF64X4rm), X86::sub_ymm); case X86::VMOVAPSZ128mr_NOVLX: return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr), get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); case X86::VMOVUPSZ128mr_NOVLX: return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr), get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); case X86::VMOVAPSZ256mr_NOVLX: return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr), get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::VMOVUPSZ256mr_NOVLX: return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::MOV32ri64: { Register Reg = MIB.getReg(0); Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit); MI.setDesc(get(X86::MOV32ri)); MIB->getOperand(0).setReg(Reg32); MIB.addReg(Reg, RegState::ImplicitDefine); return true; } // KNL does not recognize dependency-breaking idioms for mask registers, // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. // Using %k0 as the undef input register is a performance heuristic based // on the assumption that %k0 is used less frequently than the other mask // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. 
case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; case X86::XOR64_FP: case X86::XOR32_FP: return expandXorFP(MIB, *this); case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8)); case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8)); case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8)); case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8)); case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break; case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break; case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break; case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break; case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break; case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break; case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break; case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break; case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break; case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break; } return false; } /// Return true for all instructions that only update /// the first 32 or 64-bits of the destination register and leave the rest /// unmodified. This can be used to avoid folding loads if the instructions /// only update part of the destination register, and the non-updated part is /// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these /// instructions breaks the partial register dependency and it can improve /// performance. e.g.: /// /// movss (%rdi), %xmm0 /// cvtss2sd %xmm0, %xmm0 /// /// Instead of /// cvtss2sd (%rdi), %xmm0 /// /// FIXME: This should be turned into a TSFlags. /// static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold = false) { switch (Opcode) { case X86::CVTSI2SSrr: case X86::CVTSI2SSrm: case X86::CVTSI642SSrr: case X86::CVTSI642SSrm: case X86::CVTSI2SDrr: case X86::CVTSI2SDrm: case X86::CVTSI642SDrr: case X86::CVTSI642SDrm: // Load folding won't effect the undef register update since the input is // a GPR. 
return !ForLoadFold; case X86::CVTSD2SSrr: case X86::CVTSD2SSrm: case X86::CVTSS2SDrr: case X86::CVTSS2SDrm: case X86::MOVHPDrm: case X86::MOVHPSrm: case X86::MOVLPDrm: case X86::MOVLPSrm: case X86::RCPSSr: case X86::RCPSSm: case X86::RCPSSr_Int: case X86::RCPSSm_Int: case X86::ROUNDSDr: case X86::ROUNDSDm: case X86::ROUNDSSr: case X86::ROUNDSSm: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: case X86::RSQRTSSm_Int: case X86::SQRTSSr: case X86::SQRTSSm: case X86::SQRTSSr_Int: case X86::SQRTSSm_Int: case X86::SQRTSDr: case X86::SQRTSDm: case X86::SQRTSDr_Int: case X86::SQRTSDm_Int: return true; // GPR case X86::POPCNT32rm: case X86::POPCNT32rr: case X86::POPCNT64rm: case X86::POPCNT64rr: return Subtarget.hasPOPCNTFalseDeps(); case X86::LZCNT32rm: case X86::LZCNT32rr: case X86::LZCNT64rm: case X86::LZCNT64rr: case X86::TZCNT32rm: case X86::TZCNT32rr: case X86::TZCNT64rm: case X86::TZCNT64rr: return Subtarget.hasLZCNTFalseDeps(); } return false; } /// Inform the BreakFalseDeps pass how many idle /// instructions we would like before a partial register update. unsigned X86InstrInfo::getPartialRegUpdateClearance( const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget)) return 0; // If MI is marked as reading Reg, the partial register update is wanted. const MachineOperand &MO = MI.getOperand(0); Register Reg = MO.getReg(); if (Reg.isVirtual()) { if (MO.readsReg() || MI.readsVirtualRegister(Reg)) return 0; } else { if (MI.readsRegister(Reg, TRI)) return 0; } // If any instructions in the clearance range are reading Reg, insert a // dependency breaking instruction, which is inexpensive and is likely to // be hidden in other instruction's cycles. return PartialRegUpdateClearance; } // Return true for any instruction the copies the high bits of the first source // operand into the unused high bits of the destination operand. // Also returns true for instructions that have two inputs where one may // be undef and we want it to use the same register as the other input. static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold = false) { // Set the OpNum parameter to the first source operand. switch (Opcode) { case X86::MMX_PUNPCKHBWirr: case X86::MMX_PUNPCKHWDirr: case X86::MMX_PUNPCKHDQirr: case X86::MMX_PUNPCKLBWirr: case X86::MMX_PUNPCKLWDirr: case X86::MMX_PUNPCKLDQirr: case X86::MOVHLPSrr: case X86::PACKSSWBrr: case X86::PACKUSWBrr: case X86::PACKSSDWrr: case X86::PACKUSDWrr: case X86::PUNPCKHBWrr: case X86::PUNPCKLBWrr: case X86::PUNPCKHWDrr: case X86::PUNPCKLWDrr: case X86::PUNPCKHDQrr: case X86::PUNPCKLDQrr: case X86::PUNPCKHQDQrr: case X86::PUNPCKLQDQrr: case X86::SHUFPDrri: case X86::SHUFPSrri: // These instructions are sometimes used with an undef first or second // source. Return true here so BreakFalseDeps will assign this source to the // same register as the first source to avoid a false dependency. // Operand 1 of these instructions is tied so they're separate from their // VEX counterparts. 
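// Illustrative sketch (physical registers chosen arbitrarily): for
//   %xmm0 = PUNPCKLBWrr %xmm0, undef %xmm2
// reporting operand 2 lets BreakFalseDeps retarget the undef read of %xmm2 to
// %xmm0, so the instruction no longer waits on whatever last wrote %xmm2.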
return OpNum == 2 && !ForLoadFold; case X86::VMOVLHPSrr: case X86::VMOVLHPSZrr: case X86::VPACKSSWBrr: case X86::VPACKUSWBrr: case X86::VPACKSSDWrr: case X86::VPACKUSDWrr: case X86::VPACKSSWBZ128rr: case X86::VPACKUSWBZ128rr: case X86::VPACKSSDWZ128rr: case X86::VPACKUSDWZ128rr: case X86::VPERM2F128rr: case X86::VPERM2I128rr: case X86::VSHUFF32X4Z256rri: case X86::VSHUFF32X4Zrri: case X86::VSHUFF64X2Z256rri: case X86::VSHUFF64X2Zrri: case X86::VSHUFI32X4Z256rri: case X86::VSHUFI32X4Zrri: case X86::VSHUFI64X2Z256rri: case X86::VSHUFI64X2Zrri: case X86::VPUNPCKHBWrr: case X86::VPUNPCKLBWrr: case X86::VPUNPCKHBWYrr: case X86::VPUNPCKLBWYrr: case X86::VPUNPCKHBWZ128rr: case X86::VPUNPCKLBWZ128rr: case X86::VPUNPCKHBWZ256rr: case X86::VPUNPCKLBWZ256rr: case X86::VPUNPCKHBWZrr: case X86::VPUNPCKLBWZrr: case X86::VPUNPCKHWDrr: case X86::VPUNPCKLWDrr: case X86::VPUNPCKHWDYrr: case X86::VPUNPCKLWDYrr: case X86::VPUNPCKHWDZ128rr: case X86::VPUNPCKLWDZ128rr: case X86::VPUNPCKHWDZ256rr: case X86::VPUNPCKLWDZ256rr: case X86::VPUNPCKHWDZrr: case X86::VPUNPCKLWDZrr: case X86::VPUNPCKHDQrr: case X86::VPUNPCKLDQrr: case X86::VPUNPCKHDQYrr: case X86::VPUNPCKLDQYrr: case X86::VPUNPCKHDQZ128rr: case X86::VPUNPCKLDQZ128rr: case X86::VPUNPCKHDQZ256rr: case X86::VPUNPCKLDQZ256rr: case X86::VPUNPCKHDQZrr: case X86::VPUNPCKLDQZrr: case X86::VPUNPCKHQDQrr: case X86::VPUNPCKLQDQrr: case X86::VPUNPCKHQDQYrr: case X86::VPUNPCKLQDQYrr: case X86::VPUNPCKHQDQZ128rr: case X86::VPUNPCKLQDQZ128rr: case X86::VPUNPCKHQDQZ256rr: case X86::VPUNPCKLQDQZ256rr: case X86::VPUNPCKHQDQZrr: case X86::VPUNPCKLQDQZrr: // These instructions are sometimes used with an undef first or second // source. Return true here so BreakFalseDeps will assign this source to the // same register as the first source to avoid a false dependency. return (OpNum == 1 || OpNum == 2) && !ForLoadFold; case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSrr_Int: case X86::VCVTSI2SSrm_Int: case X86::VCVTSI642SSrr: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSrr_Int: case X86::VCVTSI642SSrm_Int: case X86::VCVTSI2SDrr: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDrr_Int: case X86::VCVTSI2SDrm_Int: case X86::VCVTSI642SDrr: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDrr_Int: case X86::VCVTSI642SDrm_Int: // AVX-512 case X86::VCVTSI2SSZrr: case X86::VCVTSI2SSZrm: case X86::VCVTSI2SSZrr_Int: case X86::VCVTSI2SSZrrb_Int: case X86::VCVTSI2SSZrm_Int: case X86::VCVTSI642SSZrr: case X86::VCVTSI642SSZrm: case X86::VCVTSI642SSZrr_Int: case X86::VCVTSI642SSZrrb_Int: case X86::VCVTSI642SSZrm_Int: case X86::VCVTSI2SDZrr: case X86::VCVTSI2SDZrm: case X86::VCVTSI2SDZrr_Int: case X86::VCVTSI2SDZrm_Int: case X86::VCVTSI642SDZrr: case X86::VCVTSI642SDZrm: case X86::VCVTSI642SDZrr_Int: case X86::VCVTSI642SDZrrb_Int: case X86::VCVTSI642SDZrm_Int: case X86::VCVTUSI2SSZrr: case X86::VCVTUSI2SSZrm: case X86::VCVTUSI2SSZrr_Int: case X86::VCVTUSI2SSZrrb_Int: case X86::VCVTUSI2SSZrm_Int: case X86::VCVTUSI642SSZrr: case X86::VCVTUSI642SSZrm: case X86::VCVTUSI642SSZrr_Int: case X86::VCVTUSI642SSZrrb_Int: case X86::VCVTUSI642SSZrm_Int: case X86::VCVTUSI2SDZrr: case X86::VCVTUSI2SDZrm: case X86::VCVTUSI2SDZrr_Int: case X86::VCVTUSI2SDZrm_Int: case X86::VCVTUSI642SDZrr: case X86::VCVTUSI642SDZrm: case X86::VCVTUSI642SDZrr_Int: case X86::VCVTUSI642SDZrrb_Int: case X86::VCVTUSI642SDZrm_Int: // Load folding won't effect the undef register update since the input is // a GPR. 
return OpNum == 1 && !ForLoadFold; case X86::VCVTSD2SSrr: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSrr_Int: case X86::VCVTSD2SSrm_Int: case X86::VCVTSS2SDrr: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDrr_Int: case X86::VCVTSS2SDrm_Int: case X86::VRCPSSr: case X86::VRCPSSr_Int: case X86::VRCPSSm: case X86::VRCPSSm_Int: case X86::VROUNDSDr: case X86::VROUNDSDm: case X86::VROUNDSDr_Int: case X86::VROUNDSDm_Int: case X86::VROUNDSSr: case X86::VROUNDSSm: case X86::VROUNDSSr_Int: case X86::VROUNDSSm_Int: case X86::VRSQRTSSr: case X86::VRSQRTSSr_Int: case X86::VRSQRTSSm: case X86::VRSQRTSSm_Int: case X86::VSQRTSSr: case X86::VSQRTSSr_Int: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSDr: case X86::VSQRTSDr_Int: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: // AVX-512 case X86::VCVTSD2SSZrr: case X86::VCVTSD2SSZrr_Int: case X86::VCVTSD2SSZrrb_Int: case X86::VCVTSD2SSZrm: case X86::VCVTSD2SSZrm_Int: case X86::VCVTSS2SDZrr: case X86::VCVTSS2SDZrr_Int: case X86::VCVTSS2SDZrrb_Int: case X86::VCVTSS2SDZrm: case X86::VCVTSS2SDZrm_Int: case X86::VGETEXPSDZr: case X86::VGETEXPSDZrb: case X86::VGETEXPSDZm: case X86::VGETEXPSSZr: case X86::VGETEXPSSZrb: case X86::VGETEXPSSZm: case X86::VGETMANTSDZrri: case X86::VGETMANTSDZrrib: case X86::VGETMANTSDZrmi: case X86::VGETMANTSSZrri: case X86::VGETMANTSSZrrib: case X86::VGETMANTSSZrmi: case X86::VRNDSCALESDZr: case X86::VRNDSCALESDZr_Int: case X86::VRNDSCALESDZrb_Int: case X86::VRNDSCALESDZm: case X86::VRNDSCALESDZm_Int: case X86::VRNDSCALESSZr: case X86::VRNDSCALESSZr_Int: case X86::VRNDSCALESSZrb_Int: case X86::VRNDSCALESSZm: case X86::VRNDSCALESSZm_Int: case X86::VRCP14SDZrr: case X86::VRCP14SDZrm: case X86::VRCP14SSZrr: case X86::VRCP14SSZrm: case X86::VRCP28SDZr: case X86::VRCP28SDZrb: case X86::VRCP28SDZm: case X86::VRCP28SSZr: case X86::VRCP28SSZrb: case X86::VRCP28SSZm: case X86::VREDUCESSZrmi: case X86::VREDUCESSZrri: case X86::VREDUCESSZrrib: case X86::VRSQRT14SDZrr: case X86::VRSQRT14SDZrm: case X86::VRSQRT14SSZrr: case X86::VRSQRT14SSZrm: case X86::VRSQRT28SDZr: case X86::VRSQRT28SDZrb: case X86::VRSQRT28SDZm: case X86::VRSQRT28SSZr: case X86::VRSQRT28SSZrb: case X86::VRSQRT28SSZm: case X86::VSQRTSSZr: case X86::VSQRTSSZr_Int: case X86::VSQRTSSZrb_Int: case X86::VSQRTSSZm: case X86::VSQRTSSZm_Int: case X86::VSQRTSDZr: case X86::VSQRTSDZr_Int: case X86::VSQRTSDZrb_Int: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: return OpNum == 1; case X86::VMOVSSZrrk: case X86::VMOVSDZrrk: return OpNum == 3 && !ForLoadFold; case X86::VMOVSSZrrkz: case X86::VMOVSDZrrkz: return OpNum == 2 && !ForLoadFold; } return false; } /// Inform the BreakFalseDeps pass how many idle instructions we would like /// before certain undef register reads. /// /// This catches the VCVTSI2SD family of instructions: /// /// vcvtsi2sdq %rax, undef %xmm0, %xmm14 /// /// We should to be careful *not* to catch VXOR idioms which are presumably /// handled specially in the pipeline: /// /// vxorps undef %xmm1, undef %xmm1, %xmm1 /// /// Like getPartialRegUpdateClearance, this makes a strong assumption that the /// high bits that are passed-through are not live. 
unsigned X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { const MachineOperand &MO = MI.getOperand(OpNum); if (Register::isPhysicalRegister(MO.getReg()) && hasUndefRegUpdate(MI.getOpcode(), OpNum)) return UndefRegClearance; return 0; } void X86InstrInfo::breakPartialRegDependency( MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { Register Reg = MI.getOperand(OpNum).getReg(); // If MI kills this register, the false dependence is already broken. if (MI.killsRegister(Reg, TRI)) return; if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); MI.addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); MI.addRegisterKilled(Reg, TRI, true); } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. Register XReg = TRI->getSubReg(Reg, X86::sub_32bit); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); MI.addRegisterKilled(Reg, TRI, true); } else if (X86::GR32RegClass.contains(Reg)) { BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg) .addReg(Reg, RegState::Undef) .addReg(Reg, RegState::Undef); MI.addRegisterKilled(Reg, TRI, true); } } static void addOperands(MachineInstrBuilder &MIB, ArrayRef MOs, int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); if (NumAddrOps < 4) { // FrameIndex only - add an immediate offset (whether its zero or not). for (unsigned i = 0; i != NumAddrOps; ++i) MIB.add(MOs[i]); addOffset(MIB, PtrOffset); } else { // General Memory Addressing - we need to add any offset to an existing // offset. assert(MOs.size() == 5 && "Unexpected memory operand list length"); for (unsigned i = 0; i != NumAddrOps; ++i) { const MachineOperand &MO = MOs[i]; if (i == 3 && PtrOffset != 0) { MIB.addDisp(MO, PtrOffset); } else { MIB.add(MO); } } } } static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII) { MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); for (int Idx : llvm::seq(0, NewMI.getNumOperands())) { MachineOperand &MO = NewMI.getOperand(Idx); // We only need to update constraints on virtual register operands. 
if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; auto *NewRC = MRI.constrainRegClass( Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF)); if (!NewRC) { LLVM_DEBUG( dbgs() << "WARNING: Unable to update register constraint for operand " << Idx << " of instruction:\n"; NewMI.dump(); dbgs() << "\n"); } } } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII) { // Create the base instruction with the memory operand as the first part. // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); addOperands(MIB, MOs); // Loop over the rest of the ri operands, converting them over. unsigned NumOps = MI.getDesc().getNumOperands() - 2; for (unsigned i = 0; i != NumOps; ++i) { MachineOperand &MO = MI.getOperand(i + 2); MIB.add(MO); } for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); MIB.add(MO); } updateOperandRegConstraints(MF, *NewMI, TII); MachineBasicBlock *MBB = InsertPt->getParent(); MBB->insert(InsertPt, NewMI); return MIB; } static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); addOperands(MIB, MOs, PtrOffset); } else { MIB.add(MO); } } updateOperandRegConstraints(MF, *NewMI, TII); // Copy the NoFPExcept flag from the instruction we're fusing. if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept); MachineBasicBlock *MBB = InsertPt->getParent(); MBB->insert(InsertPt, NewMI); return MIB; } static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI) { MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), TII.get(Opcode)); addOperands(MIB, MOs); return MIB.addImm(0); } MachineInstr *X86InstrInfo::foldMemoryOperandCustom( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, unsigned Size, Align Alignment) const { switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: case X86::VINSERTPSZrr: // Attempt to convert the load of inserted vector into a fold load // of a single float. if (OpNum == 2) { unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); unsigned ZMask = Imm & 15; unsigned DstIdx = (Imm >> 4) & 3; unsigned SrcIdx = (Imm >> 6) & 3; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm : (MI.getOpcode() == X86::VINSERTPSrr) ? 
X86::VINSERTPSrm : X86::INSERTPSrm; MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); return NewMI; } } break; case X86::MOVHLPSrr: case X86::VMOVHLPSrr: case X86::VMOVHLPSZrr: // Move the upper 64-bits of the second operand to the lower 64-bits. // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : X86::MOVLPSrm; MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); return NewMI; } } break; case X86::UNPCKLPDrr: // If we won't be able to fold this to the memory form of UNPCKL, use // MOVHPD instead. Done as custom because we can't have this in the load // table twice. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this); return NewMI; } } break; } return nullptr; } static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; // The are two cases we need to handle depending on where in the pipeline // the folding attempt is being made. // -Register has the undef flag set. // -Register is produced by the IMPLICIT_DEF instruction. if (MI.getOperand(1).isUndef()) return true; MachineRegisterInfo &RegInfo = MF.getRegInfo(); MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg()); return VRegDef && VRegDef->isImplicitDef(); } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, unsigned Size, Align Alignment, bool AllowCommute) const { bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); bool isTwoAddrFold = false; // For CPUs that favor the register form of a call or push, // do not fold loads into calls or pushes, unless optimizing for size // aggressively. if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() && (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || MI.getOpcode() == X86::PUSH64r)) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. if (!MF.getFunction().hasOptSize() && (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; unsigned NumOps = MI.getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; // FIXME: AsmPrinter doesn't know how to handle // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
if (MI.getOpcode() == X86::ADD32ri && MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) return nullptr; // GOTTPOFF relocation loads can only be folded into add instructions. // FIXME: Need to exclude other relocations that only support specific // instructions. if (MOs.size() == X86::AddrNumOperands && MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF && MI.getOpcode() != X86::ADD64rr) return nullptr; MachineInstr *NewMI = nullptr; // Attempt to fold any custom cases we have. if (MachineInstr *CustomMI = foldMemoryOperandCustom( MF, MI, OpNum, MOs, InsertPt, Size, Alignment)) return CustomMI; const X86MemoryFoldTableEntry *I = nullptr; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { I = lookupTwoAddrFoldTable(MI.getOpcode()); isTwoAddrFold = true; } else { if (OpNum == 0) { if (MI.getOpcode() == X86::MOV32r0) { NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); if (NewMI) return NewMI; } } I = lookupFoldTable(MI.getOpcode(), OpNum); } if (I != nullptr) { unsigned Opcode = I->DstOp; bool FoldedLoad = isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0; bool FoldedStore = isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE); MaybeAlign MinAlign = decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT); if (MinAlign && Alignment < *MinAlign) return nullptr; bool NarrowToMOV32rm = false; if (Size) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int. if (FoldedLoad && Size < RCSize) { // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is // due to live interval analysis remat'ing a load from stack slot. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) return nullptr; if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; } // For stores, make sure the size of the object is equal to the size of // the store. If the object is larger, the extra bits would be garbage. If // the object is smaller we might overwrite another object or fault. if (FoldedStore && Size != RCSize) return nullptr; } if (isTwoAddrFold) NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this); else NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this); if (NarrowToMOV32rm) { // If this is the special case where we use a MOV32rm to load a 32-bit // value and zero-extend the top bits. Change the destination register // to a 32-bit one. Register DstReg = NewMI->getOperand(0).getReg(); if (DstReg.isPhysical()) NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); } return NewMI; } // If the instruction and target operand are commutable, commute the // instruction and try again. 
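// Illustrative sketch (register numbers invented): a reload cannot be folded
// into the tied first source of
//   %3:gr32 = ADD32rr %spilled, %other
// but after commuting the two sources the fold is retried on operand 2,
// where the table maps ADD32rr to ADD32rm.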
if (AllowCommute) { unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI.getDesc().getNumDefs(); Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register(); Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); bool Tied1 = 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied2 = 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; MachineInstr *CommutedMI = commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); if (!CommutedMI) { // Unable to commute. return nullptr; } if (CommutedMI != &MI) { // New instruction. We can't fold from this. CommutedMI->eraseFromParent(); return nullptr; } // Attempt to fold with the commuted version of the instruction. NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size, Alignment, /*AllowCommute=*/false); if (NewMI) return NewMI; // Folding failed again - undo the commute before returning. MachineInstr *UncommutedMI = commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); if (!UncommutedMI) { // Unable to commute. return nullptr; } if (UncommutedMI != &MI) { // New instruction. It doesn't need to be kept. UncommutedMI->eraseFromParent(); return nullptr; } // Return here to prevent duplicate fuse failure report. return nullptr; } } // No fusion if (PrintFailedFusing && !MI.isCopy()) dbgs() << "We failed to fuse operand " << OpNum << " in " << MI; return nullptr; } MachineInstr * X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, VirtRegMap *VRM) const { // Check switch flag if (NoFusing) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. if (!MF.getFunction().hasOptSize() && (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; // Don't fold subreg spills, or reloads that use a high subreg. for (auto Op : Ops) { MachineOperand &MO = MI.getOperand(Op); auto SubReg = MO.getSubReg(); if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi)) return nullptr; } const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Size = MFI.getObjectSize(FrameIndex); Align Alignment = MFI.getObjectAlign(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) Alignment = std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; switch (MI.getOpcode()) { default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break; } // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Size < RCSize) return nullptr; // Change to CMPXXri r, 0 first. 
MI.setDesc(get(NewOpc)); MI.getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) return nullptr; return foldMemoryOperandImpl(MF, MI, Ops[0], MachineOperand::CreateFI(FrameIndex), InsertPt, Size, Alignment, /*AllowCommute=*/true); } /// Check if \p LoadMI is a partial register load that we can't fold into \p MI /// because the latter uses contents that wouldn't be defined in the folded /// version. For instance, this transformation isn't legal: /// movss (%rdi), %xmm0 /// addps %xmm0, %xmm0 /// -> /// addps (%rdi), %xmm0 /// /// But this one is: /// movss (%rdi), %xmm0 /// addss %xmm0, %xmm0 /// -> /// addss (%rdi), %xmm0 /// static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF) { unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); unsigned RegSize = TRI.getRegSizeInBits(*RC); if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm || Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt || Opc == X86::VMOVSSZrm_alt) && RegSize > 32) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user // instruction isn't scalar (SS). switch (UserOpc) { case X86::CVTSS2SDrr_Int: case X86::VCVTSS2SDrr_Int: case X86::VCVTSS2SDZrr_Int: case X86::VCVTSS2SDZrr_Intk: case X86::VCVTSS2SDZrr_Intkz: case X86::CVTSS2SIrr_Int: case X86::CVTSS2SI64rr_Int: case X86::VCVTSS2SIrr_Int: case X86::VCVTSS2SI64rr_Int: case X86::VCVTSS2SIZrr_Int: case X86::VCVTSS2SI64Zrr_Int: case X86::CVTTSS2SIrr_Int: case X86::CVTTSS2SI64rr_Int: case X86::VCVTTSS2SIrr_Int: case X86::VCVTTSS2SI64rr_Int: case X86::VCVTTSS2SIZrr_Int: case X86::VCVTTSS2SI64Zrr_Int: case X86::VCVTSS2USIZrr_Int: case X86::VCVTSS2USI64Zrr_Int: case X86::VCVTTSS2USIZrr_Int: case X86::VCVTTSS2USI64Zrr_Int: case X86::RCPSSr_Int: case X86::VRCPSSr_Int: case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int: case X86::ROUNDSSr_Int: case X86::VROUNDSSr_Int: case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int: case X86::UCOMISSrr_Int:case X86::VUCOMISSrr_Int:case X86::VUCOMISSZrr_Int: case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int: case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int: case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int: case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int: case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: case X86::SQRTSSr_Int: case X86::VSQRTSSr_Int: case X86::VSQRTSSZr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz: case X86::VCMPSSZrr_Intk: case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz: case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz: case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz: case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz: case X86::VSQRTSSZr_Intk: case X86::VSQRTSSZr_Intkz: case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz: case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int: case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int: case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int: case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int: case X86::VFMADD231SSr_Int: case 
X86::VFNMADD231SSr_Int: case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int: case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int: case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int: case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int: case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int: case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int: case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int: case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int: case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int: case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk: case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk: case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk: case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk: case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk: case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk: case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz: case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz: case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz: case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz: case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz: case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz: case X86::VFIXUPIMMSSZrri: case X86::VFIXUPIMMSSZrrik: case X86::VFIXUPIMMSSZrrikz: case X86::VFPCLASSSSZrr: case X86::VFPCLASSSSZrrk: case X86::VGETEXPSSZr: case X86::VGETEXPSSZrk: case X86::VGETEXPSSZrkz: case X86::VGETMANTSSZrri: case X86::VGETMANTSSZrrik: case X86::VGETMANTSSZrrikz: case X86::VRANGESSZrri: case X86::VRANGESSZrrik: case X86::VRANGESSZrrikz: case X86::VRCP14SSZrr: case X86::VRCP14SSZrrk: case X86::VRCP14SSZrrkz: case X86::VRCP28SSZr: case X86::VRCP28SSZrk: case X86::VRCP28SSZrkz: case X86::VREDUCESSZrri: case X86::VREDUCESSZrrik: case X86::VREDUCESSZrrikz: case X86::VRNDSCALESSZr_Int: case X86::VRNDSCALESSZr_Intk: case X86::VRNDSCALESSZr_Intkz: case X86::VRSQRT14SSZrr: case X86::VRSQRT14SSZrrk: case X86::VRSQRT14SSZrrkz: case X86::VRSQRT28SSZr: case X86::VRSQRT28SSZrk: case X86::VRSQRT28SSZrkz: case X86::VSCALEFSSZrr: case X86::VSCALEFSSZrrk: case X86::VSCALEFSSZrrkz: return false; default: return true; } } if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm || Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt || Opc == X86::VMOVSDZrm_alt) && RegSize > 64) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user // instruction isn't scalar (SD). 
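    // For example, folding such a (V)MOVSDrm into a packed ADDPDrr user would
    // make the folded form read 16 bytes where the original code only loaded
    // 8, so it must be rejected; the scalar *SD*_Int users listed below read
    // only the low element and therefore remain safe to fold.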
switch (UserOpc) { case X86::CVTSD2SSrr_Int: case X86::VCVTSD2SSrr_Int: case X86::VCVTSD2SSZrr_Int: case X86::VCVTSD2SSZrr_Intk: case X86::VCVTSD2SSZrr_Intkz: case X86::CVTSD2SIrr_Int: case X86::CVTSD2SI64rr_Int: case X86::VCVTSD2SIrr_Int: case X86::VCVTSD2SI64rr_Int: case X86::VCVTSD2SIZrr_Int: case X86::VCVTSD2SI64Zrr_Int: case X86::CVTTSD2SIrr_Int: case X86::CVTTSD2SI64rr_Int: case X86::VCVTTSD2SIrr_Int: case X86::VCVTTSD2SI64rr_Int: case X86::VCVTTSD2SIZrr_Int: case X86::VCVTTSD2SI64Zrr_Int: case X86::VCVTSD2USIZrr_Int: case X86::VCVTSD2USI64Zrr_Int: case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int: case X86::ROUNDSDr_Int: case X86::VROUNDSDr_Int: case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int: case X86::UCOMISDrr_Int:case X86::VUCOMISDrr_Int:case X86::VUCOMISDZrr_Int: case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int: case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int: case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int: case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int: case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: case X86::SQRTSDr_Int: case X86::VSQRTSDr_Int: case X86::VSQRTSDZr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz: case X86::VCMPSDZrr_Intk: case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz: case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz: case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz: case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz: case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz: case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz: case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int: case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int: case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int: case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int: case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int: case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int: case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int: case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int: case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int: case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int: case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int: case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int: case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int: case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int: case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk: case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk: case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk: case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk: case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk: case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk: case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz: case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz: case X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz: case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz: case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz: case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz: case X86::VFIXUPIMMSDZrri: case X86::VFIXUPIMMSDZrrik: case X86::VFIXUPIMMSDZrrikz: case X86::VFPCLASSSDZrr: case X86::VFPCLASSSDZrrk: case X86::VGETEXPSDZr: case X86::VGETEXPSDZrk: case X86::VGETEXPSDZrkz: 
case X86::VGETMANTSDZrri: case X86::VGETMANTSDZrrik: case X86::VGETMANTSDZrrikz: case X86::VRANGESDZrri: case X86::VRANGESDZrrik: case X86::VRANGESDZrrikz: case X86::VRCP14SDZrr: case X86::VRCP14SDZrrk: case X86::VRCP14SDZrrkz: case X86::VRCP28SDZr: case X86::VRCP28SDZrk: case X86::VRCP28SDZrkz: case X86::VREDUCESDZrri: case X86::VREDUCESDZrrik: case X86::VREDUCESDZrrikz: case X86::VRNDSCALESDZr_Int: case X86::VRNDSCALESDZr_Intk: case X86::VRNDSCALESDZr_Intkz: case X86::VRSQRT14SDZrr: case X86::VRSQRT14SDZrrk: case X86::VRSQRT14SDZrrkz: case X86::VRSQRT28SDZr: case X86::VRSQRT28SDZrk: case X86::VRSQRT28SDZrkz: case X86::VSCALEFSDZrr: case X86::VSCALEFSDZrrk: case X86::VSCALEFSDZrrkz: return false; default: return true; } } return false; } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, LiveIntervals *LIS) const { // TODO: Support the case where LoadMI loads a wide register, but MI // only uses a subreg. for (auto Op : Ops) { if (MI.getOperand(Op).getSubReg()) return nullptr; } // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI.getDesc().getNumOperands(); int FrameIndex; if (isLoadFromStackSlot(LoadMI, FrameIndex)) { if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) return nullptr; return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS); } // Check switch flag if (NoFusing) return nullptr; // Avoid partial and undef register update stalls unless optimizing for size. if (!MF.getFunction().hasOptSize() && (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) || shouldPreventUndefRegUpdateMemFold(MF, MI))) return nullptr; // Determine the alignment of the load. Align Alignment; if (LoadMI.hasOneMemOperand()) Alignment = (*LoadMI.memoperands_begin())->getAlign(); else switch (LoadMI.getOpcode()) { case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: Alignment = Align(64); break; case X86::AVX2_SETALLONES: case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: Alignment = Align(32); break; case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: Alignment = Align(16); break; case X86::MMX_SET0: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: Alignment = Align(8); break; case X86::FsFLD0SS: case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; default: return nullptr; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; switch (MI.getOpcode()) { default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; } // Change to CMPXXri r, 0 first. MI.setDesc(get(NewOpc)); MI.getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) return nullptr; // Make sure the subregisters match. // Otherwise we risk changing the size of the load. 
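  // (For instance, if LoadMI defines the full register while MI only reads a
  // subregister of it, the folded form could access a different number of
  // bytes than the original load did; requiring identical subregister indices
  // on both sides is the conservative answer.)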
if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg()) return nullptr; SmallVector MOs; switch (LoadMI.getOpcode()) { case X86::MMX_SET0: case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX2_SETALLONES: case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: case X86::AVX512_FsFLD0SS: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. if (MF.getTarget().getCodeModel() != CodeModel::Small && MF.getTarget().getCodeModel() != CodeModel::Kernel) return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; if (MF.getTarget().isPositionIndependent()) { if (Subtarget.is64Bit()) PICBase = X86::RIP; else // FIXME: PICBase = getGlobalBaseReg(&MF); // This doesn't work for several reasons. // 1. GlobalBaseReg may have been spilled. // 2. It may not be live at MI. return nullptr; } // Create a constant-pool entry. MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI.getOpcode(); if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8); else if (Opc == X86::MMX_SET0) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2); else Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || Opc == X86::AVX512_512_SETALLONES || Opc == X86::AVX1_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); // Create operands to load from the constant pool entry. MOs.push_back(MachineOperand::CreateReg(PICBase, false)); MOs.push_back(MachineOperand::CreateImm(1)); MOs.push_back(MachineOperand::CreateReg(0, false)); MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); MOs.push_back(MachineOperand::CreateReg(0, false)); break; } default: { if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) return nullptr; // Folding a normal load. Just copy the load's address operands. MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, LoadMI.operands_begin() + NumOps); break; } } return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/0, Alignment, /*AllowCommute=*/true); } static SmallVector extractLoadMMOs(ArrayRef MMOs, MachineFunction &MF) { SmallVector LoadMMOs; for (MachineMemOperand *MMO : MMOs) { if (!MMO->isLoad()) continue; if (!MMO->isStore()) { // Reuse the MMO. LoadMMOs.push_back(MMO); } else { // Clone the MMO and unset the store flag. 
LoadMMOs.push_back(MF.getMachineMemOperand( MMO, MMO->getFlags() & ~MachineMemOperand::MOStore)); } } return LoadMMOs; } static SmallVector extractStoreMMOs(ArrayRef MMOs, MachineFunction &MF) { SmallVector StoreMMOs; for (MachineMemOperand *MMO : MMOs) { if (!MMO->isStore()) continue; if (!MMO->isLoad()) { // Reuse the MMO. StoreMMOs.push_back(MMO); } else { // Clone the MMO and unset the load flag. StoreMMOs.push_back(MF.getMachineMemOperand( MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad)); } } return StoreMMOs; } static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI) { assert(STI.hasAVX512() && "Expected at least AVX512!"); unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); assert((SpillSize == 64 || STI.hasVLX()) && "Can't broadcast less than 64 bytes without AVX512VL!"); switch (I->Flags & TB_BCAST_MASK) { default: llvm_unreachable("Unexpected broadcast type!"); case TB_BCAST_D: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); case 16: return X86::VPBROADCASTDZ128rm; case 32: return X86::VPBROADCASTDZ256rm; case 64: return X86::VPBROADCASTDZrm; } break; case TB_BCAST_Q: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); case 16: return X86::VPBROADCASTQZ128rm; case 32: return X86::VPBROADCASTQZ256rm; case 64: return X86::VPBROADCASTQZrm; } break; case TB_BCAST_SS: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); case 16: return X86::VBROADCASTSSZ128rm; case 32: return X86::VBROADCASTSSZ256rm; case 64: return X86::VBROADCASTSSZrm; } break; case TB_BCAST_SD: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); case 16: return X86::VMOVDDUPZ128rm; case 32: return X86::VBROADCASTSDZ256rm; case 64: return X86::VBROADCASTSDZrm; } break; } } bool X86InstrInfo::unfoldMemoryOperand( MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl &NewMIs) const { const X86MemoryFoldTableEntry *I = lookupUnfoldTable(MI.getOpcode()); if (I == nullptr) return false; unsigned Opc = I->DstOp; unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; if (UnfoldLoad && !FoldedLoad) return false; UnfoldLoad &= FoldedLoad; if (UnfoldStore && !FoldedStore) return false; UnfoldStore &= FoldedStore; const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. return false; SmallVector AddrOps; SmallVector BeforeOps; SmallVector AfterOps; SmallVector ImpOps; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &Op = MI.getOperand(i); if (i >= Index && i < Index + X86::AddrNumOperands) AddrOps.push_back(Op); else if (Op.isReg() && Op.isImplicit()) ImpOps.push_back(Op); else if (i < Index) BeforeOps.push_back(Op); else if (i > Index) AfterOps.push_back(Op); } // Emit the load or broadcast instruction. 
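  // For a plain reload this selects the ordinary load opcode for the register
  // class (aligned or unaligned form, based on the memory operand's
  // alignment); for a folded broadcast it uses getBroadcastOpcode() to rebuild
  // the VBROADCAST*/VPBROADCAST* load that had been folded away.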
if (UnfoldLoad) { auto MMOs = extractLoadMMOs(MI.memoperands(), MF); unsigned Opc; if (FoldedBCast) { Opc = getBroadcastOpcode(I, RC, Subtarget); } else { unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); } DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) MIB.add(AddrOps[i]); MIB.setMemRefs(MMOs); NewMIs.push_back(MIB); if (UnfoldStore) { // Address operands cannot be marked isKill. for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { MachineOperand &MO = NewMIs[0]->getOperand(i); if (MO.isReg()) MO.setIsKill(false); } } } // Emit the data processing instruction. MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, DataMI); if (FoldedStore) MIB.addReg(Reg, RegState::Define); for (MachineOperand &BeforeOp : BeforeOps) MIB.add(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); for (MachineOperand &AfterOp : AfterOps) MIB.add(AfterOp); for (MachineOperand &ImpOp : ImpOps) { MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) | RegState::Implicit | getKillRegState(ImpOp.isKill()) | getDeadRegState(ImpOp.isDead()) | getUndefRegState(ImpOp.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (DataMI->getOpcode()) { default: break; case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: { MachineOperand &MO0 = DataMI->getOperand(0); MachineOperand &MO1 = DataMI->getOperand(1); if (MO1.getImm() == 0) { unsigned NewOpc; switch (DataMI->getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::CMP64ri8: case X86::CMP64ri32: NewOpc = X86::TEST64rr; break; case X86::CMP32ri8: case X86::CMP32ri: NewOpc = X86::TEST32rr; break; case X86::CMP16ri8: case X86::CMP16ri: NewOpc = X86::TEST16rr; break; case X86::CMP8ri: NewOpc = X86::TEST8rr; break; } DataMI->setDesc(get(NewOpc)); MO1.ChangeToRegister(MO0.getReg(), false); } } } NewMIs.push_back(DataMI); // Emit the store instruction. 
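  // Only taken when the caller asked for the store side to be unfolded and
  // the folded form actually wrote memory (FoldedStore); the unfolded value
  // is written back from Reg, which is killed by the new store.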
if (UnfoldStore) { const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max(TRI.getSpillSize(*DstRC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) MIB.add(AddrOps[i]); MIB.addReg(Reg, RegState::Kill); MIB.setMemRefs(MMOs); NewMIs.push_back(MIB); } return true; } bool X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, SmallVectorImpl &NewNodes) const { if (!N->isMachineOpcode()) return false; const X86MemoryFoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode()); if (I == nullptr) return false; unsigned Opc = I->DstOp; unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); unsigned NumDefs = MCID.NumDefs; std::vector AddrOps; std::vector BeforeOps; std::vector AfterOps; SDLoc dl(N); unsigned NumOps = N->getNumOperands(); for (unsigned i = 0; i != NumOps-1; ++i) { SDValue Op = N->getOperand(i); if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands) AddrOps.push_back(Op); else if (i < Index-NumDefs) BeforeOps.push_back(Op); else if (i > Index-NumDefs) AfterOps.push_back(Op); } SDValue Chain = N->getOperand(NumOps-1); AddrOps.push_back(Chain); // Emit the load instruction. SDNode *Load = nullptr; if (FoldedLoad) { EVT VT = *TRI.legalclasstypes_begin(*RC); auto MMOs = extractLoadMMOs(cast(N)->memoperands(), MF); if (MMOs.empty() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned load. return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. unsigned Opc; if (FoldedBCast) { Opc = getBroadcastOpcode(I, RC, Subtarget); } else { unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); } Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. DAG.setNodeMemRefs(cast(Load), MMOs); } // Emit the data processing instruction. std::vector VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) VTs.push_back(VT); } if (Load) BeforeOps.push_back(SDValue(Load, 0)); llvm::append_range(BeforeOps, AfterOps); // Change CMP32ri r, 0 back to TEST32rr r, r, etc. 
switch (Opc) { default: break; case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: if (isNullConstant(BeforeOps[1])) { switch (Opc) { default: llvm_unreachable("Unreachable!"); case X86::CMP64ri8: case X86::CMP64ri32: Opc = X86::TEST64rr; break; case X86::CMP32ri8: case X86::CMP32ri: Opc = X86::TEST32rr; break; case X86::CMP16ri8: case X86::CMP16ri: Opc = X86::TEST16rr; break; case X86::CMP8ri: Opc = X86::TEST8rr; break; } BeforeOps[1] = BeforeOps[0]; } } SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); NewNodes.push_back(NewNode); // Emit the store instruction. if (FoldedStore) { AddrOps.pop_back(); AddrOps.push_back(SDValue(NewNode, 0)); AddrOps.push_back(Chain); auto MMOs = extractStoreMMOs(cast(N)->memoperands(), MF); if (MMOs.empty() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned store. return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), dl, MVT::Other, AddrOps); NewNodes.push_back(Store); // Preserve memory reference information. DAG.setNodeMemRefs(cast(Store), MMOs); } return true; } unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex) const { const X86MemoryFoldTableEntry *I = lookupUnfoldTable(Opc); if (I == nullptr) return 0; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; if (UnfoldLoad && !FoldedLoad) return 0; if (UnfoldStore && !FoldedStore) return 0; if (LoadRegIndex) *LoadRegIndex = I->Flags & TB_INDEX_MASK; return I->DstOp; } bool X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const { if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) return false; unsigned Opc1 = Load1->getMachineOpcode(); unsigned Opc2 = Load2->getMachineOpcode(); switch (Opc1) { default: return false; case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case X86::LD_Fp32m: case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: case X86::MOVSSrm_alt: case X86::MOVSDrm: case X86::MOVSDrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSSrm_alt: case X86::VMOVSDrm: case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: // AVX512 load instructions case X86::VMOVSSZrm: case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: case X86::VMOVSDZrm_alt: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: case X86::VMOVDQU16Z128rm: case X86::VMOVDQA32Z128rm: case X86::VMOVDQU32Z128rm: case X86::VMOVDQA64Z128rm: case X86::VMOVDQU64Z128rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVUPSZ256rm_NOVLX: case 
X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: case X86::VMOVDQU16Z256rm: case X86::VMOVDQA32Z256rm: case X86::VMOVDQU32Z256rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQU64Z256rm: case X86::VMOVAPSZrm: case X86::VMOVUPSZrm: case X86::VMOVAPDZrm: case X86::VMOVUPDZrm: case X86::VMOVDQU8Zrm: case X86::VMOVDQU16Zrm: case X86::VMOVDQA32Zrm: case X86::VMOVDQU32Zrm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU64Zrm: case X86::KMOVBkm: case X86::KMOVWkm: case X86::KMOVDkm: case X86::KMOVQkm: break; } switch (Opc2) { default: return false; case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case X86::LD_Fp32m: case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: case X86::MOVSSrm_alt: case X86::MOVSDrm: case X86::MOVSDrm_alt: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSSrm_alt: case X86::VMOVSDrm: case X86::VMOVSDrm_alt: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: // AVX512 load instructions case X86::VMOVSSZrm: case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: case X86::VMOVSDZrm_alt: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: case X86::VMOVAPSZ128rm_NOVLX: case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: case X86::VMOVDQU16Z128rm: case X86::VMOVDQA32Z128rm: case X86::VMOVDQU32Z128rm: case X86::VMOVDQA64Z128rm: case X86::VMOVDQU64Z128rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: case X86::VMOVDQU16Z256rm: case X86::VMOVDQA32Z256rm: case X86::VMOVDQU32Z256rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQU64Z256rm: case X86::VMOVAPSZrm: case X86::VMOVUPSZrm: case X86::VMOVAPDZrm: case X86::VMOVUPDZrm: case X86::VMOVDQU8Zrm: case X86::VMOVDQU16Zrm: case X86::VMOVDQA32Zrm: case X86::VMOVDQU32Zrm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU64Zrm: case X86::KMOVBkm: case X86::KMOVWkm: case X86::KMOVDkm: case X86::KMOVQkm: break; } // Lambda to check if both the loads have the same value for an operand index. auto HasSameOp = [&](int I) { return Load1->getOperand(I) == Load2->getOperand(I); }; // All operands except the displacement should match. if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) || !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg)) return false; // Chain Operand must be the same. if (!HasSameOp(5)) return false; // Now let's examine if the displacements are constants. auto Disp1 = dyn_cast(Load1->getOperand(X86::AddrDisp)); auto Disp2 = dyn_cast(Load2->getOperand(X86::AddrDisp)); if (!Disp1 || !Disp2) return false; Offset1 = Disp1->getSExtValue(); Offset2 = Disp2->getSExtValue(); return true; } bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const { assert(Offset2 > Offset1); if ((Offset2 - Offset1) / 8 > 64) return false; unsigned Opc1 = Load1->getMachineOpcode(); unsigned Opc2 = Load2->getMachineOpcode(); if (Opc1 != Opc2) return false; // FIXME: overly conservative? 
  switch (Opc1) {
  default: break;
  case X86::LD_Fp32m:
  case X86::LD_Fp64m:
  case X86::LD_Fp80m:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
    return false;
  }

  EVT VT = Load1->getValueType(0);
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
    // have 16 of them to play with.
    if (Subtarget.is64Bit()) {
      if (NumLoads >= 3)
        return false;
    } else if (NumLoads) {
      return false;
    }
    break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
  case MVT::i64:
  case MVT::f32:
  case MVT::f64:
    if (NumLoads)
      return false;
    break;
  }

  return true;
}

bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                        const MachineBasicBlock *MBB,
                                        const MachineFunction &MF) const {
  // ENDBR instructions should not be scheduled around.
  unsigned Opcode = MI.getOpcode();
  if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32)
    return true;

  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
}

bool X86InstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
  Cond[0].setImm(GetOppositeBranchCondition(CC));
  return false;
}

bool X86InstrInfo::
isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
  // FIXME: Return false for x87 stack register classes for now. We can't
  // allow any loads of these registers before FpGet_ST0_80.
  return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
           RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
           RC == &X86::RFP80RegClass);
}

/// Return a virtual register initialized with the global base register
/// value. Output instructions required to initialize the register in the
/// function entry block, if necessary.
///
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
  assert((!Subtarget.is64Bit() ||
          MF->getTarget().getCodeModel() == CodeModel::Medium ||
          MF->getTarget().getCodeModel() == CodeModel::Large) &&
         "X86-64 PIC uses RIP relative addressing");

  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
  Register GlobalBaseReg = X86FI->getGlobalBaseReg();
  if (GlobalBaseReg != 0)
    return GlobalBaseReg;

  // Create the register. The code to initialize it is inserted
  // later, by the CGBR pass (below).
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  GlobalBaseReg = RegInfo.createVirtualRegister(
      Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
  X86FI->setGlobalBaseReg(GlobalBaseReg);
  return GlobalBaseReg;
}

// These are the replaceable SSE instructions. Some of these have Int variants
// that we don't include here. We don't want to replace instructions selected
// by intrinsics.
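// For example, the execution-domain fixup can rewrite X86::XORPSrr (the
// PackedSingle column) into X86::PXORrr (the PackedInt column) when the value
// is kept in the integer domain, avoiding a domain-crossing penalty; each row
// simply lists the equivalent opcode for every domain.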
static const uint16_t ReplaceableInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr }, { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr }, { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm }, { X86::MOVSDrm_alt,X86::MOVSDrm_alt,X86::MOVQI2PQIrm }, { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm }, { X86::MOVSSrm_alt,X86::MOVSSrm_alt,X86::MOVDI2PDIrm }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, { X86::UNPCKLPDrm, X86::UNPCKLPDrm, X86::PUNPCKLQDQrm }, { X86::MOVLHPSrr, X86::UNPCKLPDrr, X86::PUNPCKLQDQrr }, { X86::UNPCKHPDrm, X86::UNPCKHPDrm, X86::PUNPCKHQDQrm }, { X86::UNPCKHPDrr, X86::UNPCKHPDrr, X86::PUNPCKHQDQrr }, { X86::UNPCKLPSrm, X86::UNPCKLPSrm, X86::PUNPCKLDQrm }, { X86::UNPCKLPSrr, X86::UNPCKLPSrr, X86::PUNPCKLDQrr }, { X86::UNPCKHPSrm, X86::UNPCKHPSrm, X86::PUNPCKHDQrm }, { X86::UNPCKHPSrr, X86::UNPCKHPSrr, X86::PUNPCKHDQrr }, { X86::EXTRACTPSmr, X86::EXTRACTPSmr, X86::PEXTRDmr }, { X86::EXTRACTPSrr, X86::EXTRACTPSrr, X86::PEXTRDrr }, // AVX 128-bit support { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr }, { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr }, { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm }, { X86::VMOVSDrm_alt,X86::VMOVSDrm_alt,X86::VMOVQI2PQIrm }, { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm }, { X86::VMOVSSrm_alt,X86::VMOVSSrm_alt,X86::VMOVDI2PDIrm }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, { X86::VUNPCKLPDrm, X86::VUNPCKLPDrm, X86::VPUNPCKLQDQrm }, { X86::VMOVLHPSrr, X86::VUNPCKLPDrr, X86::VPUNPCKLQDQrr }, { X86::VUNPCKHPDrm, X86::VUNPCKHPDrm, X86::VPUNPCKHQDQrm }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrr, X86::VPUNPCKHQDQrr }, { X86::VUNPCKLPSrm, X86::VUNPCKLPSrm, X86::VPUNPCKLDQrm }, { X86::VUNPCKLPSrr, X86::VUNPCKLPSrr, X86::VPUNPCKLDQrr }, { X86::VUNPCKHPSrm, X86::VUNPCKHPSrm, X86::VPUNPCKHDQrm }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrr, X86::VPUNPCKHDQrr }, { X86::VEXTRACTPSmr, X86::VEXTRACTPSmr, X86::VPEXTRDmr }, { X86::VEXTRACTPSrr, X86::VEXTRACTPSrr, X86::VPEXTRDrr }, // AVX 256-bit support { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, 
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, { X86::VPERMPSYrm, X86::VPERMPSYrm, X86::VPERMDYrm }, { X86::VPERMPSYrr, X86::VPERMPSYrr, X86::VPERMDYrr }, { X86::VPERMPDYmi, X86::VPERMPDYmi, X86::VPERMQYmi }, { X86::VPERMPDYri, X86::VPERMPDYri, X86::VPERMQYri }, // AVX512 support { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr }, { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr }, { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr }, { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr }, { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr }, { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm }, { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm }, { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm }, { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm }, { X86::VBROADCASTSSZ128rr,X86::VBROADCASTSSZ128rr,X86::VPBROADCASTDZ128rr }, { X86::VBROADCASTSSZ128rm,X86::VBROADCASTSSZ128rm,X86::VPBROADCASTDZ128rm }, { X86::VBROADCASTSSZ256rr,X86::VBROADCASTSSZ256rr,X86::VPBROADCASTDZ256rr }, { X86::VBROADCASTSSZ256rm,X86::VBROADCASTSSZ256rm,X86::VPBROADCASTDZ256rm }, { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrr, X86::VPBROADCASTDZrr }, { X86::VBROADCASTSSZrm, X86::VBROADCASTSSZrm, X86::VPBROADCASTDZrm }, { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128rr }, { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128rm }, { X86::VBROADCASTSDZ256rr,X86::VBROADCASTSDZ256rr,X86::VPBROADCASTQZ256rr }, { X86::VBROADCASTSDZ256rm,X86::VBROADCASTSDZ256rm,X86::VPBROADCASTQZ256rm }, { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrr, X86::VPBROADCASTQZrr }, { X86::VBROADCASTSDZrm, X86::VBROADCASTSDZrm, X86::VPBROADCASTQZrm }, { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr }, { X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm }, { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr }, { X86::VINSERTF32x8Zrm, X86::VINSERTF32x8Zrm, X86::VINSERTI32x8Zrm }, { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrr, X86::VINSERTI64x2Zrr }, { X86::VINSERTF64x2Zrm, X86::VINSERTF64x2Zrm, X86::VINSERTI64x2Zrm }, { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrr, X86::VINSERTI64x4Zrr }, { X86::VINSERTF64x4Zrm, X86::VINSERTF64x4Zrm, X86::VINSERTI64x4Zrm }, { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rr,X86::VINSERTI32x4Z256rr }, { X86::VINSERTF32x4Z256rm,X86::VINSERTF32x4Z256rm,X86::VINSERTI32x4Z256rm }, { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rr,X86::VINSERTI64x2Z256rr }, { X86::VINSERTF64x2Z256rm,X86::VINSERTF64x2Z256rm,X86::VINSERTI64x2Z256rm }, { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zrr, X86::VEXTRACTI32x4Zrr }, { X86::VEXTRACTF32x4Zmr, X86::VEXTRACTF32x4Zmr, X86::VEXTRACTI32x4Zmr }, { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zrr, X86::VEXTRACTI32x8Zrr }, { X86::VEXTRACTF32x8Zmr, X86::VEXTRACTF32x8Zmr, X86::VEXTRACTI32x8Zmr }, { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zrr, X86::VEXTRACTI64x2Zrr }, { X86::VEXTRACTF64x2Zmr, X86::VEXTRACTF64x2Zmr, X86::VEXTRACTI64x2Zmr }, { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zrr, X86::VEXTRACTI64x4Zrr }, { X86::VEXTRACTF64x4Zmr, X86::VEXTRACTF64x4Zmr, X86::VEXTRACTI64x4Zmr }, { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTI32x4Z256rr }, { X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTI32x4Z256mr }, { 
X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTI64x2Z256rr }, { X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTI64x2Z256mr }, { X86::VPERMILPSmi, X86::VPERMILPSmi, X86::VPSHUFDmi }, { X86::VPERMILPSri, X86::VPERMILPSri, X86::VPSHUFDri }, { X86::VPERMILPSZ128mi, X86::VPERMILPSZ128mi, X86::VPSHUFDZ128mi }, { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128ri, X86::VPSHUFDZ128ri }, { X86::VPERMILPSZ256mi, X86::VPERMILPSZ256mi, X86::VPSHUFDZ256mi }, { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256ri, X86::VPSHUFDZ256ri }, { X86::VPERMILPSZmi, X86::VPERMILPSZmi, X86::VPSHUFDZmi }, { X86::VPERMILPSZri, X86::VPERMILPSZri, X86::VPSHUFDZri }, { X86::VPERMPSZ256rm, X86::VPERMPSZ256rm, X86::VPERMDZ256rm }, { X86::VPERMPSZ256rr, X86::VPERMPSZ256rr, X86::VPERMDZ256rr }, { X86::VPERMPDZ256mi, X86::VPERMPDZ256mi, X86::VPERMQZ256mi }, { X86::VPERMPDZ256ri, X86::VPERMPDZ256ri, X86::VPERMQZ256ri }, { X86::VPERMPDZ256rm, X86::VPERMPDZ256rm, X86::VPERMQZ256rm }, { X86::VPERMPDZ256rr, X86::VPERMPDZ256rr, X86::VPERMQZ256rr }, { X86::VPERMPSZrm, X86::VPERMPSZrm, X86::VPERMDZrm }, { X86::VPERMPSZrr, X86::VPERMPSZrr, X86::VPERMDZrr }, { X86::VPERMPDZmi, X86::VPERMPDZmi, X86::VPERMQZmi }, { X86::VPERMPDZri, X86::VPERMPDZri, X86::VPERMQZri }, { X86::VPERMPDZrm, X86::VPERMPDZrm, X86::VPERMQZrm }, { X86::VPERMPDZrr, X86::VPERMPDZrr, X86::VPERMQZrr }, { X86::VUNPCKLPDZ256rm, X86::VUNPCKLPDZ256rm, X86::VPUNPCKLQDQZ256rm }, { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rr, X86::VPUNPCKLQDQZ256rr }, { X86::VUNPCKHPDZ256rm, X86::VUNPCKHPDZ256rm, X86::VPUNPCKHQDQZ256rm }, { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rr, X86::VPUNPCKHQDQZ256rr }, { X86::VUNPCKLPSZ256rm, X86::VUNPCKLPSZ256rm, X86::VPUNPCKLDQZ256rm }, { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rr, X86::VPUNPCKLDQZ256rr }, { X86::VUNPCKHPSZ256rm, X86::VUNPCKHPSZ256rm, X86::VPUNPCKHDQZ256rm }, { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rr, X86::VPUNPCKHDQZ256rr }, { X86::VUNPCKLPDZ128rm, X86::VUNPCKLPDZ128rm, X86::VPUNPCKLQDQZ128rm }, { X86::VMOVLHPSZrr, X86::VUNPCKLPDZ128rr, X86::VPUNPCKLQDQZ128rr }, { X86::VUNPCKHPDZ128rm, X86::VUNPCKHPDZ128rm, X86::VPUNPCKHQDQZ128rm }, { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rr, X86::VPUNPCKHQDQZ128rr }, { X86::VUNPCKLPSZ128rm, X86::VUNPCKLPSZ128rm, X86::VPUNPCKLDQZ128rm }, { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rr, X86::VPUNPCKLDQZ128rr }, { X86::VUNPCKHPSZ128rm, X86::VUNPCKHPSZ128rm, X86::VPUNPCKHDQZ128rm }, { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rr, X86::VPUNPCKHDQZ128rr }, { X86::VUNPCKLPDZrm, X86::VUNPCKLPDZrm, X86::VPUNPCKLQDQZrm }, { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrr, X86::VPUNPCKLQDQZrr }, { X86::VUNPCKHPDZrm, X86::VUNPCKHPDZrm, X86::VPUNPCKHQDQZrm }, { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrr, X86::VPUNPCKHQDQZrr }, { X86::VUNPCKLPSZrm, X86::VUNPCKLPSZrm, X86::VPUNPCKLDQZrm }, { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrr, X86::VPUNPCKLDQZrr }, { X86::VUNPCKHPSZrm, X86::VUNPCKHPSZrm, X86::VPUNPCKHDQZrm }, { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrr, X86::VPUNPCKHDQZrr }, { X86::VEXTRACTPSZmr, X86::VEXTRACTPSZmr, X86::VPEXTRDZmr }, { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZrr, X86::VPEXTRDZrr }, }; static const uint16_t ReplaceableInstrsAVX2[][3] = { //PackedSingle PackedDouble PackedInt { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr }, { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, { X86::VXORPSYrm, 
X86::VXORPDYrm, X86::VPXORYrm }, { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, { X86::VMOVDDUPrm, X86::VMOVDDUPrm, X86::VPBROADCASTQrm}, { X86::VMOVDDUPrr, X86::VMOVDDUPrr, X86::VPBROADCASTQrr}, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}, { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 }, { X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri }, { X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi }, { X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi }, { X86::VPERMILPSYri, X86::VPERMILPSYri, X86::VPSHUFDYri }, { X86::VUNPCKLPDYrm, X86::VUNPCKLPDYrm, X86::VPUNPCKLQDQYrm }, { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrr, X86::VPUNPCKLQDQYrr }, { X86::VUNPCKHPDYrm, X86::VUNPCKHPDYrm, X86::VPUNPCKHQDQYrm }, { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrr, X86::VPUNPCKHQDQYrr }, { X86::VUNPCKLPSYrm, X86::VUNPCKLPSYrm, X86::VPUNPCKLDQYrm }, { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrr, X86::VPUNPCKLDQYrr }, { X86::VUNPCKHPSYrm, X86::VUNPCKHPSYrm, X86::VPUNPCKHDQYrm }, { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr }, }; static const uint16_t ReplaceableInstrsFP[][3] = { //PackedSingle PackedDouble { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END }, { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END }, { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END }, { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END }, { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END }, { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END }, { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END }, { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END }, { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END }, }; static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = { //PackedSingle PackedDouble PackedInt { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, }; static const uint16_t ReplaceableInstrsAVX512[][4] = { // Two integer columns for 64-bit and 32-bit elements. 
//PackedSingle PackedDouble PackedInt PackedInt { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr }, { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm }, { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr }, { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr }, { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm }, { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr }, { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm }, { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr }, { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr }, { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm }, { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr }, { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm }, { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr }, { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr }, { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm }, }; static const uint16_t ReplaceableInstrsAVX512DQ[][4] = { // Two integer columns for 64-bit and 32-bit elements. //PackedSingle PackedDouble PackedInt PackedInt { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm }, { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr }, { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm }, { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr }, { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm }, { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr }, { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm }, { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr }, { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm }, { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr }, { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm }, { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr }, { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm }, { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr }, { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm }, { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr }, { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm }, { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr }, { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm }, { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr }, { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm }, { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr }, { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm }, { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr }, }; static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = { // Two integer columns for 64-bit and 32-bit elements. 
//PackedSingle PackedDouble //PackedInt PackedInt { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk, X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk }, { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz, X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz }, { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk, X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk }, { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz, X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz }, { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk, X86::VPANDQZ128rmk, X86::VPANDDZ128rmk }, { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz, X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz }, { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk, X86::VPANDQZ128rrk, X86::VPANDDZ128rrk }, { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz, X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz }, { X86::VORPSZ128rmk, X86::VORPDZ128rmk, X86::VPORQZ128rmk, X86::VPORDZ128rmk }, { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz, X86::VPORQZ128rmkz, X86::VPORDZ128rmkz }, { X86::VORPSZ128rrk, X86::VORPDZ128rrk, X86::VPORQZ128rrk, X86::VPORDZ128rrk }, { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz, X86::VPORQZ128rrkz, X86::VPORDZ128rrkz }, { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk, X86::VPXORQZ128rmk, X86::VPXORDZ128rmk }, { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz, X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz }, { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk, X86::VPXORQZ128rrk, X86::VPXORDZ128rrk }, { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz, X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz }, { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk, X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk }, { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz, X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz }, { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk, X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk }, { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz, X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz }, { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk, X86::VPANDQZ256rmk, X86::VPANDDZ256rmk }, { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz, X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz }, { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk, X86::VPANDQZ256rrk, X86::VPANDDZ256rrk }, { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz, X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz }, { X86::VORPSZ256rmk, X86::VORPDZ256rmk, X86::VPORQZ256rmk, X86::VPORDZ256rmk }, { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz, X86::VPORQZ256rmkz, X86::VPORDZ256rmkz }, { X86::VORPSZ256rrk, X86::VORPDZ256rrk, X86::VPORQZ256rrk, X86::VPORDZ256rrk }, { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz, X86::VPORQZ256rrkz, X86::VPORDZ256rrkz }, { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk, X86::VPXORQZ256rmk, X86::VPXORDZ256rmk }, { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz, X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz }, { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk, X86::VPXORQZ256rrk, X86::VPXORDZ256rrk }, { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz, X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz }, { X86::VANDNPSZrmk, X86::VANDNPDZrmk, X86::VPANDNQZrmk, X86::VPANDNDZrmk }, { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz, X86::VPANDNQZrmkz, X86::VPANDNDZrmkz }, { X86::VANDNPSZrrk, X86::VANDNPDZrrk, X86::VPANDNQZrrk, X86::VPANDNDZrrk }, { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz, X86::VPANDNQZrrkz, X86::VPANDNDZrrkz }, { X86::VANDPSZrmk, X86::VANDPDZrmk, X86::VPANDQZrmk, X86::VPANDDZrmk }, { X86::VANDPSZrmkz, X86::VANDPDZrmkz, X86::VPANDQZrmkz, X86::VPANDDZrmkz }, { X86::VANDPSZrrk, X86::VANDPDZrrk, X86::VPANDQZrrk, X86::VPANDDZrrk }, { X86::VANDPSZrrkz, X86::VANDPDZrrkz, X86::VPANDQZrrkz, X86::VPANDDZrrkz }, { X86::VORPSZrmk, X86::VORPDZrmk, X86::VPORQZrmk, X86::VPORDZrmk }, { X86::VORPSZrmkz, X86::VORPDZrmkz, X86::VPORQZrmkz, X86::VPORDZrmkz }, { 
X86::VORPSZrrk, X86::VORPDZrrk, X86::VPORQZrrk, X86::VPORDZrrk }, { X86::VORPSZrrkz, X86::VORPDZrrkz, X86::VPORQZrrkz, X86::VPORDZrrkz }, { X86::VXORPSZrmk, X86::VXORPDZrmk, X86::VPXORQZrmk, X86::VPXORDZrmk }, { X86::VXORPSZrmkz, X86::VXORPDZrmkz, X86::VPXORQZrmkz, X86::VPXORDZrmkz }, { X86::VXORPSZrrk, X86::VXORPDZrrk, X86::VPXORQZrrk, X86::VPXORDZrrk }, { X86::VXORPSZrrkz, X86::VXORPDZrrkz, X86::VPXORQZrrkz, X86::VPXORDZrrkz }, // Broadcast loads can be handled the same as masked operations to avoid // changing element size. { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb, X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb }, { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb, X86::VPANDQZ128rmb, X86::VPANDDZ128rmb }, { X86::VORPSZ128rmb, X86::VORPDZ128rmb, X86::VPORQZ128rmb, X86::VPORDZ128rmb }, { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb, X86::VPXORQZ128rmb, X86::VPXORDZ128rmb }, { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb, X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb }, { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb, X86::VPANDQZ256rmb, X86::VPANDDZ256rmb }, { X86::VORPSZ256rmb, X86::VORPDZ256rmb, X86::VPORQZ256rmb, X86::VPORDZ256rmb }, { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb, X86::VPXORQZ256rmb, X86::VPXORDZ256rmb }, { X86::VANDNPSZrmb, X86::VANDNPDZrmb, X86::VPANDNQZrmb, X86::VPANDNDZrmb }, { X86::VANDPSZrmb, X86::VANDPDZrmb, X86::VPANDQZrmb, X86::VPANDDZrmb }, { X86::VANDPSZrmb, X86::VANDPDZrmb, X86::VPANDQZrmb, X86::VPANDDZrmb }, { X86::VORPSZrmb, X86::VORPDZrmb, X86::VPORQZrmb, X86::VPORDZrmb }, { X86::VXORPSZrmb, X86::VXORPDZrmb, X86::VPXORQZrmb, X86::VPXORDZrmb }, { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk, X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk }, { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk, X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk }, { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk, X86::VPORQZ128rmbk, X86::VPORDZ128rmbk }, { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk, X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk }, { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk, X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk }, { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk, X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk }, { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk, X86::VPORQZ256rmbk, X86::VPORDZ256rmbk }, { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk, X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk }, { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk, X86::VPANDNQZrmbk, X86::VPANDNDZrmbk }, { X86::VANDPSZrmbk, X86::VANDPDZrmbk, X86::VPANDQZrmbk, X86::VPANDDZrmbk }, { X86::VANDPSZrmbk, X86::VANDPDZrmbk, X86::VPANDQZrmbk, X86::VPANDDZrmbk }, { X86::VORPSZrmbk, X86::VORPDZrmbk, X86::VPORQZrmbk, X86::VPORDZrmbk }, { X86::VXORPSZrmbk, X86::VXORPDZrmbk, X86::VPXORQZrmbk, X86::VPXORDZrmbk }, { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz, X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz}, { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz, X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz }, { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz, X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz }, { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz, X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz }, { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz, X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz}, { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz, X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz }, { X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz, X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz }, { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz, X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz }, { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz, X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz }, { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz, X86::VPANDQZrmbkz, X86::VPANDDZrmbkz }, { 
X86::VANDPSZrmbkz, X86::VANDPDZrmbkz, X86::VPANDQZrmbkz, X86::VPANDDZrmbkz }, { X86::VORPSZrmbkz, X86::VORPDZrmbkz, X86::VPORQZrmbkz, X86::VPORDZrmbkz }, { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz, X86::VPXORQZrmbkz, X86::VPXORDZrmbkz }, }; // NOTE: These should only be used by the custom domain methods. static const uint16_t ReplaceableBlendInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi }, { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri }, { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDWrmi }, { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDWrri }, { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi }, { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri }, }; static const uint16_t ReplaceableBlendAVX2Instrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi }, { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri }, { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDDYrmi }, { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDDYrri }, }; // Special table for changing EVEX logic instructions to VEX. // TODO: Should we run EVEX->VEX earlier? static const uint16_t ReplaceableCustomAVX512LogicInstrs[][4] = { // Two integer columns for 64-bit and 32-bit elements. //PackedSingle PackedDouble PackedInt PackedInt { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr }, { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDQZ128rm, X86::VPANDDZ128rm }, { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDQZ128rr, X86::VPANDDZ128rr }, { X86::VORPSrm, X86::VORPDrm, X86::VPORQZ128rm, X86::VPORDZ128rm }, { X86::VORPSrr, X86::VORPDrr, X86::VPORQZ128rr, X86::VPORDZ128rr }, { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORQZ128rm, X86::VPXORDZ128rm }, { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORQZ128rr, X86::VPXORDZ128rr }, { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm }, { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr }, { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDQZ256rm, X86::VPANDDZ256rm }, { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDQZ256rr, X86::VPANDDZ256rr }, { X86::VORPSYrm, X86::VORPDYrm, X86::VPORQZ256rm, X86::VPORDZ256rm }, { X86::VORPSYrr, X86::VORPDYrr, X86::VPORQZ256rr, X86::VPORDZ256rr }, { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORQZ256rm, X86::VPXORDZ256rm }, { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORQZ256rr, X86::VPXORDZ256rr }, }; // FIXME: Some shuffle and unpack instructions have equivalents in different // domains, but they require a bit more work than just switching opcodes. static const uint16_t *lookup(unsigned opcode, unsigned domain, ArrayRef Table) { for (const uint16_t (&Row)[3] : Table) if (Row[domain-1] == opcode) return Row; return nullptr; } static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain, ArrayRef Table) { // If this is the integer domain make sure to check both integer columns. for (const uint16_t (&Row)[4] : Table) if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode)) return Row; return nullptr; } // Helper to attempt to widen/narrow blend masks. 
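// For example (mask values chosen purely for illustration): narrowing
// OldMask=0b1100 from OldWidth=4 to NewWidth=2 groups the bits in pairs and
// yields NewMask=0b10, while OldMask=0b0110 cannot be narrowed because the
// middle group is only partially set, so the function returns false.
// Widening OldMask=0b10 from OldWidth=2 to NewWidth=4 replicates each set bit
// across its group, giving NewMask=0b1100.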
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, unsigned NewWidth, unsigned *pNewMask = nullptr) { assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) && "Illegal blend mask scale"); unsigned NewMask = 0; if ((OldWidth % NewWidth) == 0) { unsigned Scale = OldWidth / NewWidth; unsigned SubMask = (1u << Scale) - 1; for (unsigned i = 0; i != NewWidth; ++i) { unsigned Sub = (OldMask >> (i * Scale)) & SubMask; if (Sub == SubMask) NewMask |= (1u << i); else if (Sub != 0x0) return false; } } else { unsigned Scale = NewWidth / OldWidth; unsigned SubMask = (1u << Scale) - 1; for (unsigned i = 0; i != OldWidth; ++i) { if (OldMask & (1 << i)) { NewMask |= (SubMask << (i * Scale)); } } } if (pNewMask) *pNewMask = NewMask; return true; } uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); unsigned NumOperands = MI.getDesc().getNumOperands(); auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) { uint16_t validDomains = 0; if (MI.getOperand(NumOperands - 1).isImm()) { unsigned Imm = MI.getOperand(NumOperands - 1).getImm(); if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4)) validDomains |= 0x2; // PackedSingle if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2)) validDomains |= 0x4; // PackedDouble if (!Is256 || Subtarget.hasAVX2()) validDomains |= 0x8; // PackedInt } return validDomains; }; switch (Opcode) { case X86::BLENDPDrmi: case X86::BLENDPDrri: case X86::VBLENDPDrmi: case X86::VBLENDPDrri: return GetBlendDomains(2, false); case X86::VBLENDPDYrmi: case X86::VBLENDPDYrri: return GetBlendDomains(4, true); case X86::BLENDPSrmi: case X86::BLENDPSrri: case X86::VBLENDPSrmi: case X86::VBLENDPSrri: case X86::VPBLENDDrmi: case X86::VPBLENDDrri: return GetBlendDomains(4, false); case X86::VBLENDPSYrmi: case X86::VBLENDPSYrri: case X86::VPBLENDDYrmi: case X86::VPBLENDDYrri: return GetBlendDomains(8, true); case X86::PBLENDWrmi: case X86::PBLENDWrri: case X86::VPBLENDWrmi: case X86::VPBLENDWrri: // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks. case X86::VPBLENDWYrmi: case X86::VPBLENDWYrri: return GetBlendDomains(8, false); case X86::VPANDDZ128rr: case X86::VPANDDZ128rm: case X86::VPANDDZ256rr: case X86::VPANDDZ256rm: case X86::VPANDQZ128rr: case X86::VPANDQZ128rm: case X86::VPANDQZ256rr: case X86::VPANDQZ256rm: case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm: case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm: case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm: case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm: case X86::VPORDZ128rr: case X86::VPORDZ128rm: case X86::VPORDZ256rr: case X86::VPORDZ256rm: case X86::VPORQZ128rr: case X86::VPORQZ128rm: case X86::VPORQZ256rr: case X86::VPORQZ256rm: case X86::VPXORDZ128rr: case X86::VPXORDZ128rm: case X86::VPXORDZ256rr: case X86::VPXORDZ256rm: case X86::VPXORQZ128rr: case X86::VPXORQZ128rm: case X86::VPXORQZ256rr: case X86::VPXORQZ256rm: // If we don't have DQI see if we can still switch from an EVEX integer // instruction to a VEX floating point instruction. if (Subtarget.hasDQI()) return 0; if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16) return 0; if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16) return 0; // Register forms will have 3 operands. Memory form will have more. if (NumOperands == 3 && RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16) return 0; // All domains are valid. return 0xe; case X86::MOVHLPSrr: // We can swap domains when both inputs are the same register. 
// FIXME: This doesn't catch all the cases we would like. If the input // register isn't KILLed by the instruction, the two address instruction // pass puts a COPY on one input. The other input uses the original // register. This prevents the same physical register from being used by // both inputs. if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() && MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0) return 0x6; return 0; case X86::SHUFPDrri: return 0x6; } return 0; } bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const { assert(Domain > 0 && Domain < 4 && "Invalid execution domain"); uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); unsigned Opcode = MI.getOpcode(); unsigned NumOperands = MI.getDesc().getNumOperands(); auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) { if (MI.getOperand(NumOperands - 1).isImm()) { unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255; Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm); unsigned NewImm = Imm; const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs); if (!table) table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); if (Domain == 1) { // PackedSingle AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); } else if (Domain == 2) { // PackedDouble AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm); } else if (Domain == 3) { // PackedInt if (Subtarget.hasAVX2()) { // If we are already VPBLENDW use that, else use VPBLENDD. if ((ImmWidth / (Is256 ? 2 : 1)) != 8) { table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); } } else { assert(!Is256 && "128-bit vector expected"); AdjustBlendMask(Imm, ImmWidth, 8, &NewImm); } } assert(table && table[Domain - 1] && "Unknown domain op"); MI.setDesc(get(table[Domain - 1])); MI.getOperand(NumOperands - 1).setImm(NewImm & 255); } return true; }; switch (Opcode) { case X86::BLENDPDrmi: case X86::BLENDPDrri: case X86::VBLENDPDrmi: case X86::VBLENDPDrri: return SetBlendDomain(2, false); case X86::VBLENDPDYrmi: case X86::VBLENDPDYrri: return SetBlendDomain(4, true); case X86::BLENDPSrmi: case X86::BLENDPSrri: case X86::VBLENDPSrmi: case X86::VBLENDPSrri: case X86::VPBLENDDrmi: case X86::VPBLENDDrri: return SetBlendDomain(4, false); case X86::VBLENDPSYrmi: case X86::VBLENDPSYrri: case X86::VPBLENDDYrmi: case X86::VPBLENDDYrri: return SetBlendDomain(8, true); case X86::PBLENDWrmi: case X86::PBLENDWrri: case X86::VPBLENDWrmi: case X86::VPBLENDWrri: return SetBlendDomain(8, false); case X86::VPBLENDWYrmi: case X86::VPBLENDWYrri: return SetBlendDomain(16, true); case X86::VPANDDZ128rr: case X86::VPANDDZ128rm: case X86::VPANDDZ256rr: case X86::VPANDDZ256rm: case X86::VPANDQZ128rr: case X86::VPANDQZ128rm: case X86::VPANDQZ256rr: case X86::VPANDQZ256rm: case X86::VPANDNDZ128rr: case X86::VPANDNDZ128rm: case X86::VPANDNDZ256rr: case X86::VPANDNDZ256rm: case X86::VPANDNQZ128rr: case X86::VPANDNQZ128rm: case X86::VPANDNQZ256rr: case X86::VPANDNQZ256rm: case X86::VPORDZ128rr: case X86::VPORDZ128rm: case X86::VPORDZ256rr: case X86::VPORDZ256rm: case X86::VPORQZ128rr: case X86::VPORQZ128rm: case X86::VPORQZ256rr: case X86::VPORQZ256rm: case X86::VPXORDZ128rr: case X86::VPXORDZ128rm: case X86::VPXORDZ256rr: case X86::VPXORDZ256rm: case X86::VPXORQZ128rr: case X86::VPXORQZ128rm: case X86::VPXORQZ256rr: case X86::VPXORQZ256rm: { // Without DQI, convert EVEX instructions to VEX 
instructions. if (Subtarget.hasDQI()) return false; const uint16_t *table = lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs); assert(table && "Instruction not found in table?"); // Don't change integer Q instructions to D instructions and // use D intructions if we started with a PS instruction. if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) Domain = 4; MI.setDesc(get(table[Domain - 1])); return true; } case X86::UNPCKHPDrr: case X86::MOVHLPSrr: // We just need to commute the instruction which will switch the domains. if (Domain != dom && Domain != 3 && MI.getOperand(1).getReg() == MI.getOperand(2).getReg() && MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0) { commuteInstruction(MI, false); return true; } // We must always return true for MOVHLPSrr. if (Opcode == X86::MOVHLPSrr) return true; break; case X86::SHUFPDrri: { if (Domain == 1) { unsigned Imm = MI.getOperand(3).getImm(); unsigned NewImm = 0x44; if (Imm & 1) NewImm |= 0x0a; if (Imm & 2) NewImm |= 0xa0; MI.getOperand(3).setImm(NewImm); MI.setDesc(get(X86::SHUFPSrri)); } return true; } } return false; } std::pair X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; unsigned opcode = MI.getOpcode(); uint16_t validDomains = 0; if (domain) { // Attempt to match for custom instructions. validDomains = getExecutionDomainCustom(MI); if (validDomains) return std::make_pair(domain, validDomains); if (lookup(opcode, domain, ReplaceableInstrs)) { validDomains = 0xe; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; } else if (lookup(opcode, domain, ReplaceableInstrsFP)) { validDomains = 0x6; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) { // Insert/extract instructions should only effect domain if AVX2 // is enabled. if (!Subtarget.hasAVX2()) return std::make_pair(0, 0); validDomains = 0xe; } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) { validDomains = 0xe; } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) { validDomains = 0xe; } else if (Subtarget.hasDQI()) { if (const uint16_t *table = lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) { if (domain == 1 || (domain == 3 && table[3] == opcode)) validDomains = 0xa; else validDomains = 0xc; } } } return std::make_pair(domain, validDomains); } void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); // Attempt to match for custom instructions. 
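// The fixed tables below are then tried in turn (SSE/AVX, AVX2, FP,
// AVX2 insert/extract, and the AVX-512 variants). Domain is the 1-based
// column index (1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt; the
// AVX-512 paths may bump it to 4 to select the second integer column), so
// table[Domain - 1] picks the replacement opcode.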
if (setExecutionDomainCustom(MI, Domain)) return; const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs); if (!table) { // try the other table assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); } if (!table) { // try the FP table table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP); assert((!table || Domain < 3) && "Can only select PackedSingle or PackedDouble"); } if (!table) { // try the other table assert(Subtarget.hasAVX2() && "256-bit insert/extract only available in AVX2"); table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract); } if (!table) { // try the AVX512 table assert(Subtarget.hasAVX512() && "Requires AVX-512"); table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512); // Don't change integer Q instructions to D instructions. if (table && Domain == 3 && table[3] == MI.getOpcode()) Domain = 4; } if (!table) { // try the AVX512DQ table assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ); // Don't change integer Q instructions to D instructions and // use D instructions if we started with a PS instruction. if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) Domain = 4; } if (!table) { // try the AVX512DQMasked table assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked); if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) Domain = 4; } assert(table && "Cannot change domain"); MI.setDesc(get(table[Domain - 1])); } /// Return the noop instruction to use for a noop. void X86InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; case X86::DIVPDrm: case X86::DIVPDrr: case X86::DIVPSrm: case X86::DIVPSrr: case X86::DIVSDrm: case X86::DIVSDrm_Int: case X86::DIVSDrr: case X86::DIVSDrr_Int: case X86::DIVSSrm: case X86::DIVSSrm_Int: case X86::DIVSSrr: case X86::DIVSSrr_Int: case X86::SQRTPDm: case X86::SQRTPDr: case X86::SQRTPSm: case X86::SQRTPSr: case X86::SQRTSDm: case X86::SQRTSDm_Int: case X86::SQRTSDr: case X86::SQRTSDr_Int: case X86::SQRTSSm: case X86::SQRTSSm_Int: case X86::SQRTSSr: case X86::SQRTSSr_Int: // AVX instructions with high latency case X86::VDIVPDrm: case X86::VDIVPDrr: case X86::VDIVPDYrm: case X86::VDIVPDYrr: case X86::VDIVPSrm: case X86::VDIVPSrr: case X86::VDIVPSYrm: case X86::VDIVPSYrr: case X86::VDIVSDrm: case X86::VDIVSDrm_Int: case X86::VDIVSDrr: case X86::VDIVSDrr_Int: case X86::VDIVSSrm: case X86::VDIVSSrm_Int: case X86::VDIVSSrr: case X86::VDIVSSrr_Int: case X86::VSQRTPDm: case X86::VSQRTPDr: case X86::VSQRTPDYm: case X86::VSQRTPDYr: case X86::VSQRTPSm: case X86::VSQRTPSr: case X86::VSQRTPSYm: case X86::VSQRTPSYr: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: case X86::VSQRTSDr: case X86::VSQRTSDr_Int: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSSr: case X86::VSQRTSSr_Int: // AVX512 instructions with high latency case X86::VDIVPDZ128rm: case X86::VDIVPDZ128rmb: case X86::VDIVPDZ128rmbk: case X86::VDIVPDZ128rmbkz: case X86::VDIVPDZ128rmk: case X86::VDIVPDZ128rmkz: case X86::VDIVPDZ128rr: case X86::VDIVPDZ128rrk: case X86::VDIVPDZ128rrkz: case X86::VDIVPDZ256rm: case X86::VDIVPDZ256rmb: case X86::VDIVPDZ256rmbk: case X86::VDIVPDZ256rmbkz: case X86::VDIVPDZ256rmk: case 
X86::VDIVPDZ256rmkz: case X86::VDIVPDZ256rr: case X86::VDIVPDZ256rrk: case X86::VDIVPDZ256rrkz: case X86::VDIVPDZrrb: case X86::VDIVPDZrrbk: case X86::VDIVPDZrrbkz: case X86::VDIVPDZrm: case X86::VDIVPDZrmb: case X86::VDIVPDZrmbk: case X86::VDIVPDZrmbkz: case X86::VDIVPDZrmk: case X86::VDIVPDZrmkz: case X86::VDIVPDZrr: case X86::VDIVPDZrrk: case X86::VDIVPDZrrkz: case X86::VDIVPSZ128rm: case X86::VDIVPSZ128rmb: case X86::VDIVPSZ128rmbk: case X86::VDIVPSZ128rmbkz: case X86::VDIVPSZ128rmk: case X86::VDIVPSZ128rmkz: case X86::VDIVPSZ128rr: case X86::VDIVPSZ128rrk: case X86::VDIVPSZ128rrkz: case X86::VDIVPSZ256rm: case X86::VDIVPSZ256rmb: case X86::VDIVPSZ256rmbk: case X86::VDIVPSZ256rmbkz: case X86::VDIVPSZ256rmk: case X86::VDIVPSZ256rmkz: case X86::VDIVPSZ256rr: case X86::VDIVPSZ256rrk: case X86::VDIVPSZ256rrkz: case X86::VDIVPSZrrb: case X86::VDIVPSZrrbk: case X86::VDIVPSZrrbkz: case X86::VDIVPSZrm: case X86::VDIVPSZrmb: case X86::VDIVPSZrmbk: case X86::VDIVPSZrmbkz: case X86::VDIVPSZrmk: case X86::VDIVPSZrmkz: case X86::VDIVPSZrr: case X86::VDIVPSZrrk: case X86::VDIVPSZrrkz: case X86::VDIVSDZrm: case X86::VDIVSDZrr: case X86::VDIVSDZrm_Int: case X86::VDIVSDZrm_Intk: case X86::VDIVSDZrm_Intkz: case X86::VDIVSDZrr_Int: case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz: case X86::VDIVSDZrrb_Int: case X86::VDIVSDZrrb_Intk: case X86::VDIVSDZrrb_Intkz: case X86::VDIVSSZrm: case X86::VDIVSSZrr: case X86::VDIVSSZrm_Int: case X86::VDIVSSZrm_Intk: case X86::VDIVSSZrm_Intkz: case X86::VDIVSSZrr_Int: case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz: case X86::VDIVSSZrrb_Int: case X86::VDIVSSZrrb_Intk: case X86::VDIVSSZrrb_Intkz: case X86::VSQRTPDZ128m: case X86::VSQRTPDZ128mb: case X86::VSQRTPDZ128mbk: case X86::VSQRTPDZ128mbkz: case X86::VSQRTPDZ128mk: case X86::VSQRTPDZ128mkz: case X86::VSQRTPDZ128r: case X86::VSQRTPDZ128rk: case X86::VSQRTPDZ128rkz: case X86::VSQRTPDZ256m: case X86::VSQRTPDZ256mb: case X86::VSQRTPDZ256mbk: case X86::VSQRTPDZ256mbkz: case X86::VSQRTPDZ256mk: case X86::VSQRTPDZ256mkz: case X86::VSQRTPDZ256r: case X86::VSQRTPDZ256rk: case X86::VSQRTPDZ256rkz: case X86::VSQRTPDZm: case X86::VSQRTPDZmb: case X86::VSQRTPDZmbk: case X86::VSQRTPDZmbkz: case X86::VSQRTPDZmk: case X86::VSQRTPDZmkz: case X86::VSQRTPDZr: case X86::VSQRTPDZrb: case X86::VSQRTPDZrbk: case X86::VSQRTPDZrbkz: case X86::VSQRTPDZrk: case X86::VSQRTPDZrkz: case X86::VSQRTPSZ128m: case X86::VSQRTPSZ128mb: case X86::VSQRTPSZ128mbk: case X86::VSQRTPSZ128mbkz: case X86::VSQRTPSZ128mk: case X86::VSQRTPSZ128mkz: case X86::VSQRTPSZ128r: case X86::VSQRTPSZ128rk: case X86::VSQRTPSZ128rkz: case X86::VSQRTPSZ256m: case X86::VSQRTPSZ256mb: case X86::VSQRTPSZ256mbk: case X86::VSQRTPSZ256mbkz: case X86::VSQRTPSZ256mk: case X86::VSQRTPSZ256mkz: case X86::VSQRTPSZ256r: case X86::VSQRTPSZ256rk: case X86::VSQRTPSZ256rkz: case X86::VSQRTPSZm: case X86::VSQRTPSZmb: case X86::VSQRTPSZmbk: case X86::VSQRTPSZmbkz: case X86::VSQRTPSZmk: case X86::VSQRTPSZmkz: case X86::VSQRTPSZr: case X86::VSQRTPSZrb: case X86::VSQRTPSZrbk: case X86::VSQRTPSZrbkz: case X86::VSQRTPSZrk: case X86::VSQRTPSZrkz: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: case X86::VSQRTSDZm_Intk: case X86::VSQRTSDZm_Intkz: case X86::VSQRTSDZr: case X86::VSQRTSDZr_Int: case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz: case X86::VSQRTSDZrb_Int: case X86::VSQRTSDZrb_Intk: case X86::VSQRTSDZrb_Intkz: case X86::VSQRTSSZm: case X86::VSQRTSSZm_Int: case X86::VSQRTSSZm_Intk: case X86::VSQRTSSZm_Intkz: case X86::VSQRTSSZr: case X86::VSQRTSSZr_Int: case X86::VSQRTSSZr_Intk: 
case X86::VSQRTSSZr_Intkz: case X86::VSQRTSSZrb_Int: case X86::VSQRTSSZrb_Intk: case X86::VSQRTSSZrb_Intkz: case X86::VGATHERDPDYrm: case X86::VGATHERDPDZ128rm: case X86::VGATHERDPDZ256rm: case X86::VGATHERDPDZrm: case X86::VGATHERDPDrm: case X86::VGATHERDPSYrm: case X86::VGATHERDPSZ128rm: case X86::VGATHERDPSZ256rm: case X86::VGATHERDPSZrm: case X86::VGATHERDPSrm: case X86::VGATHERPF0DPDm: case X86::VGATHERPF0DPSm: case X86::VGATHERPF0QPDm: case X86::VGATHERPF0QPSm: case X86::VGATHERPF1DPDm: case X86::VGATHERPF1DPSm: case X86::VGATHERPF1QPDm: case X86::VGATHERPF1QPSm: case X86::VGATHERQPDYrm: case X86::VGATHERQPDZ128rm: case X86::VGATHERQPDZ256rm: case X86::VGATHERQPDZrm: case X86::VGATHERQPDrm: case X86::VGATHERQPSYrm: case X86::VGATHERQPSZ128rm: case X86::VGATHERQPSZ256rm: case X86::VGATHERQPSZrm: case X86::VGATHERQPSrm: case X86::VPGATHERDDYrm: case X86::VPGATHERDDZ128rm: case X86::VPGATHERDDZ256rm: case X86::VPGATHERDDZrm: case X86::VPGATHERDDrm: case X86::VPGATHERDQYrm: case X86::VPGATHERDQZ128rm: case X86::VPGATHERDQZ256rm: case X86::VPGATHERDQZrm: case X86::VPGATHERDQrm: case X86::VPGATHERQDYrm: case X86::VPGATHERQDZ128rm: case X86::VPGATHERQDZ256rm: case X86::VPGATHERQDZrm: case X86::VPGATHERQDrm: case X86::VPGATHERQQYrm: case X86::VPGATHERQQZ128rm: case X86::VPGATHERQQZ256rm: case X86::VPGATHERQQZrm: case X86::VPGATHERQQrm: case X86::VSCATTERDPDZ128mr: case X86::VSCATTERDPDZ256mr: case X86::VSCATTERDPDZmr: case X86::VSCATTERDPSZ128mr: case X86::VSCATTERDPSZ256mr: case X86::VSCATTERDPSZmr: case X86::VSCATTERPF0DPDm: case X86::VSCATTERPF0DPSm: case X86::VSCATTERPF0QPDm: case X86::VSCATTERPF0QPSm: case X86::VSCATTERPF1DPDm: case X86::VSCATTERPF1DPSm: case X86::VSCATTERPF1QPDm: case X86::VSCATTERPF1QPSm: case X86::VSCATTERQPDZ128mr: case X86::VSCATTERQPDZ256mr: case X86::VSCATTERQPDZmr: case X86::VSCATTERQPSZ128mr: case X86::VSCATTERQPSZ256mr: case X86::VSCATTERQPSZmr: case X86::VPSCATTERDDZ128mr: case X86::VPSCATTERDDZ256mr: case X86::VPSCATTERDDZmr: case X86::VPSCATTERDQZ128mr: case X86::VPSCATTERDQZ256mr: case X86::VPSCATTERDQZmr: case X86::VPSCATTERQDZ128mr: case X86::VPSCATTERQDZ256mr: case X86::VPSCATTERQDZmr: case X86::VPSCATTERQQZ128mr: case X86::VPSCATTERQQZ256mr: case X86::VPSCATTERQQZmr: return true; } } bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const { return isHighLatencyDef(DefMI.getOpcode()); } bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const { assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 && Inst.getNumDefs() <= 2 && "Reassociation needs binary operators"); // Integer binary math/logic instructions have a third source operand: // the EFLAGS register. That operand must be both defined here and never // used; ie, it must be dead. If the EFLAGS operand is live, then we can // not change anything because rearranging the operands could affect other // instructions that depend on the exact status flags (zero, sign, etc.) // that are set by using these particular operands with this operation. const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS); assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?"); if (FlagDef && !FlagDef->isDead()) return false; return TargetInstrInfo::hasReassociableOperands(Inst, MBB); } // TODO: There are many more machine instruction opcodes to match: // 1. 
Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) // 3. Other forms of the same operation (intrinsics and other variants) bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { switch (Inst.getOpcode()) { case X86::AND8rr: case X86::AND16rr: case X86::AND32rr: case X86::AND64rr: case X86::OR8rr: case X86::OR16rr: case X86::OR32rr: case X86::OR64rr: case X86::XOR8rr: case X86::XOR16rr: case X86::XOR32rr: case X86::XOR64rr: case X86::IMUL16rr: case X86::IMUL32rr: case X86::IMUL64rr: case X86::PANDrr: case X86::PORrr: case X86::PXORrr: case X86::ANDPDrr: case X86::ANDPSrr: case X86::ORPDrr: case X86::ORPSrr: case X86::XORPDrr: case X86::XORPSrr: case X86::PADDBrr: case X86::PADDWrr: case X86::PADDDrr: case X86::PADDQrr: case X86::PMULLWrr: case X86::PMULLDrr: case X86::PMAXSBrr: case X86::PMAXSDrr: case X86::PMAXSWrr: case X86::PMAXUBrr: case X86::PMAXUDrr: case X86::PMAXUWrr: case X86::PMINSBrr: case X86::PMINSDrr: case X86::PMINSWrr: case X86::PMINUBrr: case X86::PMINUDrr: case X86::PMINUWrr: case X86::VPANDrr: case X86::VPANDYrr: case X86::VPANDDZ128rr: case X86::VPANDDZ256rr: case X86::VPANDDZrr: case X86::VPANDQZ128rr: case X86::VPANDQZ256rr: case X86::VPANDQZrr: case X86::VPORrr: case X86::VPORYrr: case X86::VPORDZ128rr: case X86::VPORDZ256rr: case X86::VPORDZrr: case X86::VPORQZ128rr: case X86::VPORQZ256rr: case X86::VPORQZrr: case X86::VPXORrr: case X86::VPXORYrr: case X86::VPXORDZ128rr: case X86::VPXORDZ256rr: case X86::VPXORDZrr: case X86::VPXORQZ128rr: case X86::VPXORQZ256rr: case X86::VPXORQZrr: case X86::VANDPDrr: case X86::VANDPSrr: case X86::VANDPDYrr: case X86::VANDPSYrr: case X86::VANDPDZ128rr: case X86::VANDPSZ128rr: case X86::VANDPDZ256rr: case X86::VANDPSZ256rr: case X86::VANDPDZrr: case X86::VANDPSZrr: case X86::VORPDrr: case X86::VORPSrr: case X86::VORPDYrr: case X86::VORPSYrr: case X86::VORPDZ128rr: case X86::VORPSZ128rr: case X86::VORPDZ256rr: case X86::VORPSZ256rr: case X86::VORPDZrr: case X86::VORPSZrr: case X86::VXORPDrr: case X86::VXORPSrr: case X86::VXORPDYrr: case X86::VXORPSYrr: case X86::VXORPDZ128rr: case X86::VXORPSZ128rr: case X86::VXORPDZ256rr: case X86::VXORPSZ256rr: case X86::VXORPDZrr: case X86::VXORPSZrr: case X86::KADDBrr: case X86::KADDWrr: case X86::KADDDrr: case X86::KADDQrr: case X86::KANDBrr: case X86::KANDWrr: case X86::KANDDrr: case X86::KANDQrr: case X86::KORBrr: case X86::KORWrr: case X86::KORDrr: case X86::KORQrr: case X86::KXORBrr: case X86::KXORWrr: case X86::KXORDrr: case X86::KXORQrr: case X86::VPADDBrr: case X86::VPADDWrr: case X86::VPADDDrr: case X86::VPADDQrr: case X86::VPADDBYrr: case X86::VPADDWYrr: case X86::VPADDDYrr: case X86::VPADDQYrr: case X86::VPADDBZ128rr: case X86::VPADDWZ128rr: case X86::VPADDDZ128rr: case X86::VPADDQZ128rr: case X86::VPADDBZ256rr: case X86::VPADDWZ256rr: case X86::VPADDDZ256rr: case X86::VPADDQZ256rr: case X86::VPADDBZrr: case X86::VPADDWZrr: case X86::VPADDDZrr: case X86::VPADDQZrr: case X86::VPMULLWrr: case X86::VPMULLWYrr: case X86::VPMULLWZ128rr: case X86::VPMULLWZ256rr: case X86::VPMULLWZrr: case X86::VPMULLDrr: case X86::VPMULLDYrr: case X86::VPMULLDZ128rr: case X86::VPMULLDZ256rr: case X86::VPMULLDZrr: case X86::VPMULLQZ128rr: case X86::VPMULLQZ256rr: case X86::VPMULLQZrr: case X86::VPMAXSBrr: case X86::VPMAXSBYrr: case X86::VPMAXSBZ128rr: case X86::VPMAXSBZ256rr: case X86::VPMAXSBZrr: case X86::VPMAXSDrr: case X86::VPMAXSDYrr: case X86::VPMAXSDZ128rr: case X86::VPMAXSDZ256rr: case X86::VPMAXSDZrr: case X86::VPMAXSQZ128rr: case 
X86::VPMAXSQZ256rr: case X86::VPMAXSQZrr: case X86::VPMAXSWrr: case X86::VPMAXSWYrr: case X86::VPMAXSWZ128rr: case X86::VPMAXSWZ256rr: case X86::VPMAXSWZrr: case X86::VPMAXUBrr: case X86::VPMAXUBYrr: case X86::VPMAXUBZ128rr: case X86::VPMAXUBZ256rr: case X86::VPMAXUBZrr: case X86::VPMAXUDrr: case X86::VPMAXUDYrr: case X86::VPMAXUDZ128rr: case X86::VPMAXUDZ256rr: case X86::VPMAXUDZrr: case X86::VPMAXUQZ128rr: case X86::VPMAXUQZ256rr: case X86::VPMAXUQZrr: case X86::VPMAXUWrr: case X86::VPMAXUWYrr: case X86::VPMAXUWZ128rr: case X86::VPMAXUWZ256rr: case X86::VPMAXUWZrr: case X86::VPMINSBrr: case X86::VPMINSBYrr: case X86::VPMINSBZ128rr: case X86::VPMINSBZ256rr: case X86::VPMINSBZrr: case X86::VPMINSDrr: case X86::VPMINSDYrr: case X86::VPMINSDZ128rr: case X86::VPMINSDZ256rr: case X86::VPMINSDZrr: case X86::VPMINSQZ128rr: case X86::VPMINSQZ256rr: case X86::VPMINSQZrr: case X86::VPMINSWrr: case X86::VPMINSWYrr: case X86::VPMINSWZ128rr: case X86::VPMINSWZ256rr: case X86::VPMINSWZrr: case X86::VPMINUBrr: case X86::VPMINUBYrr: case X86::VPMINUBZ128rr: case X86::VPMINUBZ256rr: case X86::VPMINUBZrr: case X86::VPMINUDrr: case X86::VPMINUDYrr: case X86::VPMINUDZ128rr: case X86::VPMINUDZ256rr: case X86::VPMINUDZrr: case X86::VPMINUQZ128rr: case X86::VPMINUQZ256rr: case X86::VPMINUQZrr: case X86::VPMINUWrr: case X86::VPMINUWYrr: case X86::VPMINUWZ128rr: case X86::VPMINUWZ256rr: case X86::VPMINUWZrr: // Normal min/max instructions are not commutative because of NaN and signed // zero semantics, but these are. Thus, there's no need to check for global // relaxed math; the instructions themselves have the properties we need. case X86::MAXCPDrr: case X86::MAXCPSrr: case X86::MAXCSDrr: case X86::MAXCSSrr: case X86::MINCPDrr: case X86::MINCPSrr: case X86::MINCSDrr: case X86::MINCSSrr: case X86::VMAXCPDrr: case X86::VMAXCPSrr: case X86::VMAXCPDYrr: case X86::VMAXCPSYrr: case X86::VMAXCPDZ128rr: case X86::VMAXCPSZ128rr: case X86::VMAXCPDZ256rr: case X86::VMAXCPSZ256rr: case X86::VMAXCPDZrr: case X86::VMAXCPSZrr: case X86::VMAXCSDrr: case X86::VMAXCSSrr: case X86::VMAXCSDZrr: case X86::VMAXCSSZrr: case X86::VMINCPDrr: case X86::VMINCPSrr: case X86::VMINCPDYrr: case X86::VMINCPSYrr: case X86::VMINCPDZ128rr: case X86::VMINCPSZ128rr: case X86::VMINCPDZ256rr: case X86::VMINCPSZ256rr: case X86::VMINCPDZrr: case X86::VMINCPSZrr: case X86::VMINCSDrr: case X86::VMINCSSrr: case X86::VMINCSDZrr: case X86::VMINCSSZrr: return true; case X86::ADDPDrr: case X86::ADDPSrr: case X86::ADDSDrr: case X86::ADDSSrr: case X86::MULPDrr: case X86::MULPSrr: case X86::MULSDrr: case X86::MULSSrr: case X86::VADDPDrr: case X86::VADDPSrr: case X86::VADDPDYrr: case X86::VADDPSYrr: case X86::VADDPDZ128rr: case X86::VADDPSZ128rr: case X86::VADDPDZ256rr: case X86::VADDPSZ256rr: case X86::VADDPDZrr: case X86::VADDPSZrr: case X86::VADDSDrr: case X86::VADDSSrr: case X86::VADDSDZrr: case X86::VADDSSZrr: case X86::VMULPDrr: case X86::VMULPSrr: case X86::VMULPDYrr: case X86::VMULPSYrr: case X86::VMULPDZ128rr: case X86::VMULPSZ128rr: case X86::VMULPDZ256rr: case X86::VMULPSZ256rr: case X86::VMULPDZrr: case X86::VMULPSZrr: case X86::VMULSDrr: case X86::VMULSSrr: case X86::VMULSDZrr: case X86::VMULSSZrr: return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); default: return false; } } /// If \p DescribedReg overlaps with the MOVrr instruction's destination /// register then, if possible, describe the value in terms of the source /// register. 
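/// For example (registers purely illustrative): given $eax = MOV32rr $ebx,
/// describing $ax yields $bx (the matching sub-register of the source), and
/// describing $rax yields $ebx, since MOV32rr zero-extends into the upper
/// half of the destination.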
static Optional<ParamLoadedValue> describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetRegisterInfo *TRI) { Register DestReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); // If the described register is the destination, just return the source. if (DestReg == DescribedReg) return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); // If the described register is a sub-register of the destination register, // then pick out the source register's corresponding sub-register. if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) { Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx); return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); } // The remaining case to consider is when the described register is a // super-register of the destination register. MOV8rr and MOV16rr do not // write to any of the other bytes in the register, meaning that we'd have to // describe the value using a combination of the source register and the // non-overlapping bits in the described register, which is not currently // possible. if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr || !TRI->isSuperRegister(DestReg, DescribedReg)) return None; assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case"); return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); } Optional<ParamLoadedValue> X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineOperand *Op = nullptr; DIExpression *Expr = nullptr; const TargetRegisterInfo *TRI = &getRegisterInfo(); switch (MI.getOpcode()) { case X86::LEA32r: case X86::LEA64r: case X86::LEA64_32r: { // We may need to describe a 64-bit parameter with a 32-bit LEA. if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) return None; // Operand 4 could be a global address. For now we do not support // such a situation. if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm()) return None; const MachineOperand &Op1 = MI.getOperand(1); const MachineOperand &Op2 = MI.getOperand(3); assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister || Register::isPhysicalRegister(Op2.getReg()))); // Omit situations like: // %rsi = lea %rsi, 4, ...
if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) || Op2.getReg() == MI.getOperand(0).getReg()) return None; else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister && TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) || (Op2.getReg() != X86::NoRegister && TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg()))) return None; int64_t Coef = MI.getOperand(2).getImm(); int64_t Offset = MI.getOperand(4).getImm(); SmallVector<uint64_t, 8> Ops; if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) { Op = &Op1; } else if (Op1.isFI()) Op = &Op1; if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) { Ops.push_back(dwarf::DW_OP_constu); Ops.push_back(Coef + 1); Ops.push_back(dwarf::DW_OP_mul); } else { if (Op && Op2.getReg() != X86::NoRegister) { int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false); if (dwarfReg < 0) return None; else if (dwarfReg < 32) { Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg); Ops.push_back(0); } else { Ops.push_back(dwarf::DW_OP_bregx); Ops.push_back(dwarfReg); Ops.push_back(0); } } else if (!Op) { assert(Op2.getReg() != X86::NoRegister); Op = &Op2; } if (Coef > 1) { assert(Op2.getReg() != X86::NoRegister); Ops.push_back(dwarf::DW_OP_constu); Ops.push_back(Coef); Ops.push_back(dwarf::DW_OP_mul); } if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) && Op2.getReg() != X86::NoRegister) { Ops.push_back(dwarf::DW_OP_plus); } } DIExpression::appendOffset(Ops, Offset); Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops); return ParamLoadedValue(*Op, Expr); } case X86::MOV8ri: case X86::MOV16ri: // TODO: Handle MOV8ri and MOV16ri. return None; case X86::MOV32ri: case X86::MOV64ri: case X86::MOV64ri32: // MOV32ri may be used for producing zero-extended 32-bit immediates in // 64-bit parameters, so we need to consider super-registers. if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) return None; return ParamLoadedValue(MI.getOperand(1), Expr); case X86::MOV8rr: case X86::MOV16rr: case X86::MOV32rr: case X86::MOV64rr: return describeMOVrrLoadedValue(MI, Reg, TRI); case X86::XOR32rr: { // 64-bit parameters are zero-materialized using XOR32rr, so also consider // super-registers. if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) return None; if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) return ParamLoadedValue(MachineOperand::CreateImm(0), Expr); return None; } case X86::MOVSX64rr32: { // We may need to describe the lower 32 bits of the MOVSX; for example, in // cases like this: // // $ebx = [...] // $rdi = MOVSX64rr32 $ebx // $esi = MOV32rr $edi if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg)) return None; Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); // If the described register is the destination register we need to // sign-extend the source register from 32 bits. The other case we handle // is when the described register is the 32-bit sub-register of the // destination register, in which case we just need to return the source // register. if (Reg == MI.getOperand(0).getReg()) Expr = DIExpression::appendExt(Expr, 32, 64, true); else assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) && "Unhandled sub-register case for MOVSX64rr32"); return ParamLoadedValue(MI.getOperand(1), Expr); } default: assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction"); return TargetInstrInfo::describeLoadedValue(MI, Reg); } } /// This is an architecture-specific helper function of reassociateOps.
/// Set special operand attributes for new instructions after reassociation. void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const { // Propagate FP flags from the original instructions. // But clear poison-generating flags because those may not be valid now. // TODO: There should be a helper function for copying only fast-math-flags. uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags(); NewMI1.setFlags(IntersectedFlags); NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap); NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap); NewMI1.clearFlag(MachineInstr::MIFlag::IsExact); NewMI2.setFlags(IntersectedFlags); NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap); NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap); NewMI2.clearFlag(MachineInstr::MIFlag::IsExact); // Integer instructions may define an implicit EFLAGS dest register operand. MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS); MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS); assert(!OldFlagDef1 == !OldFlagDef2 && "Unexpected instruction type for reassociation"); if (!OldFlagDef1 || !OldFlagDef2) return; assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() && "Must have dead EFLAGS operand in reassociable instruction"); MachineOperand *NewFlagDef1 = NewMI1.findRegisterDefOperand(X86::EFLAGS); MachineOperand *NewFlagDef2 = NewMI2.findRegisterDefOperand(X86::EFLAGS); assert(NewFlagDef1 && NewFlagDef2 && "Unexpected operand in reassociable instruction"); // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations // of this pass or other passes. The EFLAGS operands must be dead in these new // instructions because the EFLAGS operands in the original instructions must // be dead in order for reassociation to occur. NewFlagDef1->setIsDead(); NewFlagDef2->setIsDead(); } std::pair X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { return std::make_pair(TF, 0u); } ArrayRef> X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace X86II; static const std::pair TargetFlags[] = { {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, {MO_GOT, "x86-got"}, {MO_GOTOFF, "x86-gotoff"}, {MO_GOTPCREL, "x86-gotpcrel"}, {MO_PLT, "x86-plt"}, {MO_TLSGD, "x86-tlsgd"}, {MO_TLSLD, "x86-tlsld"}, {MO_TLSLDM, "x86-tlsldm"}, {MO_GOTTPOFF, "x86-gottpoff"}, {MO_INDNTPOFF, "x86-indntpoff"}, {MO_TPOFF, "x86-tpoff"}, {MO_DTPOFF, "x86-dtpoff"}, {MO_NTPOFF, "x86-ntpoff"}, {MO_GOTNTPOFF, "x86-gotntpoff"}, {MO_DLLIMPORT, "x86-dllimport"}, {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, {MO_TLVP, "x86-tlvp"}, {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, {MO_SECREL, "x86-secrel"}, {MO_COFFSTUB, "x86-coffstub"}}; return makeArrayRef(TargetFlags); } namespace { /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. struct CGBR : public MachineFunctionPass { static char ID; CGBR() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { const X86TargetMachine *TM = static_cast(&MF.getTarget()); const X86Subtarget &STI = MF.getSubtarget(); // Don't do anything in the 64-bit small and kernel code models. They use // RIP-relative addressing for everything. if (STI.is64Bit() && (TM->getCodeModel() == CodeModel::Small || TM->getCodeModel() == CodeModel::Kernel)) return false; // Only emit a global base reg in PIC mode. 
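// (Sketch of what follows, for orientation: on 32-bit targets the base
// register is materialized with the MOVPC32r pseudo, effectively a call/pop
// that yields the PC, followed for GOT-style PIC by an ADD32ri of
// $_GLOBAL_OFFSET_TABLE_; the 64-bit medium and large code models build
// their sequences in the is64Bit() branch below.)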
if (!TM->isPositionIndependent()) return false; X86MachineFunctionInfo *X86FI = MF.getInfo(); Register GlobalBaseReg = X86FI->getGlobalBaseReg(); // If we didn't need a GlobalBaseReg, don't insert code. if (GlobalBaseReg == 0) return false; // Insert the set of GlobalBaseReg into the first MBB of the function MachineBasicBlock &FirstMBB = MF.front(); MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); MachineRegisterInfo &RegInfo = MF.getRegInfo(); const X86InstrInfo *TII = STI.getInstrInfo(); Register PC; if (STI.isPICStyleGOT()) PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); else PC = GlobalBaseReg; if (STI.is64Bit()) { if (TM->getCodeModel() == CodeModel::Medium) { // In the medium code model, use a RIP-relative LEA to materialize the // GOT. BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC) .addReg(X86::RIP) .addImm(0) .addReg(0) .addExternalSymbol("_GLOBAL_OFFSET_TABLE_") .addReg(0); } else if (TM->getCodeModel() == CodeModel::Large) { // In the large code model, we are aiming for this code, though the // register allocation may vary: // leaq .LN$pb(%rip), %rax // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx // addq %rcx, %rax // RAX now holds address of _GLOBAL_OFFSET_TABLE_. Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addSym(MF.getPICBaseSymbol()) .addReg(0); std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol()); BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg) .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", X86II::MO_PIC_BASE_OFFSET); BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC) .addReg(PBReg, RegState::Kill) .addReg(GOTReg, RegState::Kill); } else { llvm_unreachable("unexpected code model"); } } else { // Operand of MovePCtoStack is completely ignored by asm printer. It's // only used in JIT code emission as displacement to pc. BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); // If we're using vanilla 'GOT' PIC style, we should use relative // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external. if (STI.isPICStyleGOT()) { // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], // %some_register BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) .addReg(PC) .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", X86II::MO_GOT_ABSOLUTE_ADDRESS); } } return true; } StringRef getPassName() const override { return "X86 PIC Global Base Reg Initialization"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; } // namespace char CGBR::ID = 0; FunctionPass* llvm::createX86GlobalBaseRegPass() { return new CGBR(); } namespace { struct LDTLSCleanup : public MachineFunctionPass { static char ID; LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { if (skipFunction(MF.getFunction())) return false; X86MachineFunctionInfo *MFI = MF.getInfo(); if (MFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. return false; } MachineDominatorTree *DT = &getAnalysis(); return VisitNode(DT->getRootNode(), 0); } // Visit the dominator subtree rooted at Node in pre-order. // If TLSBaseAddrReg is non-null, then use that to replace any // TLS_base_addr instructions. 
Otherwise, create the register // when the first such instruction is seen, and then use it // as we encounter more instructions. bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { MachineBasicBlock *BB = Node->getBlock(); bool Changed = false; // Traverse the current block. for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { switch (I->getOpcode()) { case X86::TLS_base_addr32: case X86::TLS_base_addr64: if (TLSBaseAddrReg) I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg); else I = SetRegister(*I, &TLSBaseAddrReg); Changed = true; break; default: break; } } // Visit the children of this block in the dominator tree. for (auto I = Node->begin(), E = Node->end(); I != E; ++I) { Changed |= VisitNode(*I, TLSBaseAddrReg); } return Changed; } // Replace the TLS_base_addr instruction I with a copy from // TLSBaseAddrReg, returning the new instruction. MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I, unsigned TLSBaseAddrReg) { MachineFunction *MF = I.getParent()->getParent(); const X86Subtarget &STI = MF->getSubtarget(); const bool is64Bit = STI.is64Bit(); const X86InstrInfo *TII = STI.getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX) .addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. I.eraseFromParent(); return Copy; } // Create a virtual register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I.getParent()->getParent(); const X86Subtarget &STI = MF->getSubtarget(); const bool is64Bit = STI.is64Bit(); const X86InstrInfo *TII = STI.getInstrInfo(); // Create a virtual register for the TLS base address. MachineRegisterInfo &RegInfo = MF->getRegInfo(); *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass); // Insert a copy from RAX/EAX to TLSBaseAddrReg. MachineInstr *Next = I.getNextNode(); MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(), TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) .addReg(is64Bit ? X86::RAX : X86::EAX); return Copy; } StringRef getPassName() const override { return "Local Dynamic TLS Access Clean-up"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; } char LDTLSCleanup::ID = 0; FunctionPass* llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } /// Constants defining how certain sequences should be outlined. /// /// \p MachineOutlinerDefault implies that the function is called with a call /// instruction, and a return must be emitted for the outlined function frame. /// /// That is, /// /// I1 OUTLINED_FUNCTION: /// I2 --> call OUTLINED_FUNCTION I1 /// I3 I2 /// I3 /// ret /// /// * Call construction overhead: 1 (call instruction) /// * Frame construction overhead: 1 (return instruction) /// /// \p MachineOutlinerTailCall implies that the function is being tail called. /// A jump is emitted instead of a call, and the return is already present in /// the outlined sequence. 
That is, /// /// I1 OUTLINED_FUNCTION: /// I2 --> jmp OUTLINED_FUNCTION I1 /// ret I2 /// ret /// /// * Call construction overhead: 1 (jump instruction) /// * Frame construction overhead: 0 (don't need to return) /// enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall }; outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { unsigned SequenceSize = std::accumulate(RepeatedSequenceLocs[0].front(), std::next(RepeatedSequenceLocs[0].back()), 0, [](unsigned Sum, const MachineInstr &MI) { // FIXME: x86 doesn't implement getInstSizeInBytes, so // we can't tell the cost. Just assume each instruction // is one byte. if (MI.isDebugInstr() || MI.isKill()) return Sum; return Sum + 1; }); // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. unsigned CFICount = 0; MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { const std::vector &CFIInstructions = RepeatedSequenceLocs[0].getMF()->getFrameInstructions(); if (MBBI->isCFIInstruction()) { unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex(); MCCFIInstruction CFI = CFIInstructions[CFIIndex]; CFICount++; } MBBI++; } // We compare the number of found CFI Instructions to the number of CFI // instructions in the parent function for each candidate. We must check this // since if we outline one of the CFI instructions in a function, we have to // outline them all for correctness. If we do not, the address offsets will be // incorrect between the two sections of the program. for (outliner::Candidate &C : RepeatedSequenceLocs) { std::vector CFIInstructions = C.getMF()->getFrameInstructions(); if (CFICount > 0 && CFICount != CFIInstructions.size()) return outliner::OutlinedFunction(); } // FIXME: Use real size in bytes for call and ret instructions. if (RepeatedSequenceLocs[0].back()->isTerminator()) { for (outliner::Candidate &C : RepeatedSequenceLocs) C.setCallInfo(MachineOutlinerTailCall, 1); return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 0, // Number of bytes to emit frame. MachineOutlinerTailCall // Type of frame. ); } if (CFICount > 0) return outliner::OutlinedFunction(); for (outliner::Candidate &C : RepeatedSequenceLocs) C.setCallInfo(MachineOutlinerDefault, 1); return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault); } bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { const Function &F = MF.getFunction(); // Does the function use a red zone? If it does, then we can't risk messing // with the stack. if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) { // It could have a red zone. If it does, then we don't want to touch it. const X86MachineFunctionInfo *X86FI = MF.getInfo(); if (!X86FI || X86FI->getUsesRedZone()) return false; } // If we *don't* want to outline from things that could potentially be deduped // then return false. if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) return false; // This function is viable for outlining, so return true. return true; } outliner::InstrType X86InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const { MachineInstr &MI = *MIT; // Don't allow debug values to impact outlining type. 
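// Summary of the checks below: debug values and KILLs are Invisible; tail
// calls are Legal; other terminators/returns are Legal only if their block
// has no successors; instructions that read or modify RSP or RIP, position
// markers, and operands referencing constant pools, jump tables, CFI
// indices, frame indices or target indices are Illegal; everything else is
// Legal.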
if (MI.isDebugInstr() || MI.isIndirectDebugValue()) return outliner::InstrType::Invisible; // At this point, KILL instructions don't really tell us much so we can go // ahead and skip over them. if (MI.isKill()) return outliner::InstrType::Invisible; // Is this a tail call? If yes, we can outline as a tail call. if (isTailCall(MI)) return outliner::InstrType::Legal; // Is this the terminator of a basic block? if (MI.isTerminator() || MI.isReturn()) { // Does its parent have any successors in its MachineFunction? if (MI.getParent()->succ_empty()) return outliner::InstrType::Legal; // It does, so we can't tail call it. return outliner::InstrType::Illegal; } // Don't outline anything that modifies or reads from the stack pointer. // // FIXME: There are instructions which are being manually built without // explicit uses/defs so we also have to check the MCInstrDesc. We should be // able to remove the extra checks once those are fixed up. For example, // sometimes we might get something like %rax = POP64r 1. This won't be // caught by modifiesRegister or readsRegister even though the instruction // really ought to be formed so that modifiesRegister/readsRegister would // catch it. if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) || MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) || MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) return outliner::InstrType::Illegal; // Outlined calls change the instruction pointer, so don't read from it. if (MI.readsRegister(X86::RIP, &RI) || MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) || MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP)) return outliner::InstrType::Illegal; // Positions can't safely be outlined. if (MI.isPosition()) return outliner::InstrType::Illegal; // Make sure none of the operands of this instruction do anything tricky. for (const MachineOperand &MOP : MI.operands()) if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || MOP.isTargetIndex()) return outliner::InstrType::Illegal; return outliner::InstrType::Legal; } void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { // If we're a tail call, we already have a return, so don't do anything. if (OF.FrameConstructionID == MachineOutlinerTailCall) return; // We're a normal call, so our sequence doesn't have a return instruction. // Add it in. MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RETQ)); MBB.insert(MBB.end(), retq); } MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const { // Is it a tail call? if (C.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64)) .addGlobalAddress(M.getNamedValue(MF.getName()))); } else { // No, insert a call. It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32)) .addGlobalAddress(M.getNamedValue(MF.getName()))); } return It; } #define GET_INSTRINFO_HELPERS #include "X86GenInstrInfo.inc" diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index a61f9c5cc752..05ee6c6c8384 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -1,387 +1,265 @@ //===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file Pass to pre-config the shape of AMX registers /// AMX registers need to be configured before use. The shape of an AMX register /// is encoded in the 1st and 2nd machine operands of AMX pseudo instructions. /// The pldtilecfg instruction configures the tile registers, so it should dominate all AMX /// instructions. The pldtilecfg produces a virtual cfg register, and that cfg /// register is used by all AMX instructions. /// This pass finds the common dominator of all AMX instructions and /// inserts the pldtilecfg instruction there. Besides, the cfg register that pldtilecfg /// produces is inserted as the last operand of each AMX instruction. We use /// this scheme to model the def-use relationship between the AMX config instruction /// and the other AMX instructions. Below is an example. /// /// ----B1---- /// / \ /// / \ /// B2 B3 /// %1:tile = PTILELOADDV %2:tile = PTILELOADDV /// /// is transformed to /// /// B1 /// %25:tilecfg = PLDTILECFG /// / \ /// / \ /// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25 // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86InstrBuilder.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" -#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" #include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "tile-pre-config" namespace { class X86PreTileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; const X86Subtarget *ST = nullptr; const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; MachineDominatorTree *DomTree = nullptr; MachineRegisterInfo *MRI = nullptr; - LiveIntervals *LIS = nullptr; - SmallVector VTileRegs; - MachineInstr *TileConfigMI = nullptr; - void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx); MachineInstr *getTileConfigPoint(); - void reloadTileConfig(int FI); public: X86PreTileConfig() : MachineFunctionPass(ID) {} /// Return the pass name. StringRef getPassName() const override { return "Tile Register Pre-configure"; } /// X86PreTileConfig analysis usage. void getAnalysisUsage(AnalysisUsage &AU) const override; /// Insert the tile config instruction.
bool runOnMachineFunction(MachineFunction &mf) override; static char ID; }; } // end anonymous namespace char X86PreTileConfig::ID = 0; INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); - AU.addPreserved(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI, - int FrameIdx) { +static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, + const TargetInstrInfo *TII, + MachineRegisterInfo *MRI, + const X86Subtarget *ST) { auto *MBB = MI->getParent(); // FIXME: AMX should assume AVX512 enabled. if (ST->hasAVX512()) { // Zero stack slot. Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm) .addReg(Zmm, RegState::Undef) .addReg(Zmm, RegState::Undef); - TileConfigMI = &*addFrameReference(BuildMI(*MBB, MI, DebugLoc(), - TII->get(X86::VMOVUPSZmr)), - FrameIdx) - .addReg(Zmm); + addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)), + FrameIdx) + .addReg(Zmm); } // build psuedo ldtilecfg - addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)), - FrameIdx); + Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); + + addFrameReference( + BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); + + return VReg; } static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { default: llvm_unreachable("Unexpected machine instruction on tile"); case X86::PTILELOADDV: case X86::PTDPBSSDV: case X86::PTILEZEROV: MachineOperand &MO1 = const_cast(MI.getOperand(1)); MachineOperand &MO2 = const_cast(MI.getOperand(2)); ShapeT Shape(&MO1, &MO2, MRI); return Shape; } } MachineInstr *X86PreTileConfig::getTileConfigPoint() { DenseMap PhysShapeInfo; MachineBasicBlock *MBB = nullptr; DenseSet MIs; for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { Register VirtReg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(VirtReg)) continue; const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); if (RC.getID() != X86::TILERegClassID) continue; - VTileRegs.push_back(VirtReg); // Find the common dominator for all MI that define tile register. for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { if (MO.isUndef()) continue; const auto *MI = MO.getParent(); // PHI or IMPLICIT_DEF instructiion. // There must be a input tile before PHI instruction. if (MI->isTransient()) continue; if (!MBB) MBB = const_cast(MI->getParent()); MBB = DomTree->findNearestCommonDominator( MBB, const_cast(MI->getParent())); // Collect the instructions that define shape. ShapeT Shape = getShape(*MI, MRI); std::array ShapeMOs = {Shape.getRow(), Shape.getCol()}; for (auto *ShapeMO : ShapeMOs) { Register ShapeReg = ShapeMO->getReg(); for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) { const auto *ShapeMI = MO.getParent(); MIs.insert(ShapeMI); } } } } if (!MBB) return nullptr; // This pass is before the pass of eliminating PHI node, so it // is in SSA form. assert(MRI->isSSA() && "Not SSA form in pre-tile config"); // Shape def should dominate tile config MBB. 
// def s s1 s2 // / \ \ / // / \ \ / // conf s3=phi(s1,s2) // | // c // for (const auto *MI : MIs) { const MachineBasicBlock *ShapeMBB = MI->getParent(); if (DomTree->dominates(ShapeMBB, MBB)) continue; if (MI->isMoveImmediate()) continue; report_fatal_error(MF->getName() + ": Failed to config tile register, " "please define the shape earlier"); } // ldtilecfg should be inserted after the MI that define the shape. MachineBasicBlock::reverse_instr_iterator I, E; for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) { auto *MI = &*I; if (MIs.count(MI) && (!MI->isMoveImmediate())) break; } MachineBasicBlock::iterator MII; if (I == E) MII = MBB->getFirstNonPHI(); else { MII = MachineBasicBlock::iterator(&*I); MII++; } return &*MII; } -void X86PreTileConfig::reloadTileConfig(int FI) { - SmallSet MIVisited; - const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); - auto TileRegNum = RC->getNumRegs(); - - for (Register VReg : VTileRegs) { - BitVector UsableRegs(TRI->getNumRegs()); - for (unsigned I = 0; I < TileRegNum; I++) - UsableRegs.set(X86::TMM0 + I); - SmallVector RegSlots; - SmallVector RegMasks; - LiveInterval &LI = LIS->getInterval(VReg); - if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks)) - continue; - for (unsigned I = 0; I < RegSlots.size(); I++) { - SlotIndex &SI = RegSlots[I]; - MachineInstr *MI = LIS->getInstructionFromIndex(SI); - // We have reload the tile config register before. - if (MIVisited.count(MI)) - continue; - // For inline assembly, we don't reload tile config register. - // If there is any ldtilecfg instruction in inline assembly, - // it is user's reponsibility to restore everything. - if (!MI->isCall()) - continue; - UsableRegs.clearBitsInMask(RegMasks[I]); - MIVisited.insert(MI); - // There is no interference in callee. This is benifited from - // IPRA. - if (UsableRegs.none()) - continue; - - // build psuedo ldtilecfg - auto *MBB = MI->getParent(); - auto MII = MachineBasicBlock::iterator(MI); - MII++; - addFrameReference( - BuildMI(*MBB, *MII, DebugLoc(), TII->get(X86::LDTILECFG)), FI); - } - } - // We just check tile data register interference, we also need check tile - // config register interference. Since we don't model the config register - // we should check interference from the ldtilecfg to each tile data register - // def. - // ldtilecfg - // / \ - // BB1 BB2 - // / \ - // call BB3 - // / \ - // %1=tileload %2=tilezero - // We can start from the instruction of each tile def, and backward to - // ldtilecfg. If there is any call instruction, and tile data register is - // not preserved, we should insert ldtilecfg after the call instruction. - SmallSet MBBVisited; - for (Register VReg : VTileRegs) { - for (MachineOperand &MO : MRI->def_operands(VReg)) { - if (MO.isUndef()) - continue; - MachineInstr *MI = MO.getParent(); - // May be PHI instructiion. - // There must be several def tile before PHI instruction. - if (MI->isTransient()) - continue; - - bool Terminate = false; - MachineBasicBlock *MBB = MI->getParent(); - // backward to see if there is any call instruction after ldtilecfg. - std::queue WorkList; - WorkList.push(MBB); - bool First = true; - while (!WorkList.empty()) { - MBB = WorkList.front(); - WorkList.pop(); - // If we have iterate the basic block before, don't iterate it and - // its predecessor again. This may be caused by loop, or it has a - // cross path from several successor, or it has been iterated when - // handle other tile register. In below example, BB1 hit the condition. 
- // ldtilecfg - // | - // ---BB1--- - // / \ - // BB2 BB3 - // / \ - // %1=tileload %2=tilezero - if (MBBVisited.count(MBB)) - continue; - // For the first MBB, we start from the amx instruction which def - // tile register. - auto I = (First) ? MI->getReverseIterator() : MBB->instr_rbegin(); - for (auto E = MBB->instr_rend(); I != E; ++I) { - // If it is inserted point for ldtilecfg, then we've finished - // backward. - if (&*I == TileConfigMI) { - Terminate = true; - break; - } - if (MIVisited.count(&*I)) - continue; - if (!I->isCall()) - continue; - BitVector UsableRegs(TRI->getNumRegs()); - for (unsigned I = 0; I < TileRegNum; I++) - UsableRegs.set(X86::TMM0 + I); - for (MachineOperand &CallMO : I->operands()) { - if (CallMO.isRegMask()) - UsableRegs.clearBitsInMask(CallMO.getRegMask()); - } - // Record the call to avoid double ldtilecfg insert. - MIVisited.insert(&*I); - if (UsableRegs.none()) - continue; - // Insert ldtilecfg after call instruction. - --I; - addFrameReference( - BuildMI(*MBB, *I, DebugLoc(), TII->get(X86::LDTILECFG)), FI); - } - // We encounter visited MachineInst, so we don't need to do backward - // again. - if (Terminate) - break; - // Next we will iterate its predecessor. - for (MachineBasicBlock::pred_iterator S = MBB->pred_begin(), - E = MBB->pred_end(); - S != E; S++) - WorkList.push(*S); +static void addTileCFGUse(MachineFunction &MF, Register CFG) { + for (MachineBasicBlock &MBB : MF) { - // The first the MBB may be visited for the second time when it is in - // a loop. - if (!First) - MBBVisited.insert(MBB); - First = false; + // Traverse the basic block. + for (MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + case X86::PTILELOADDV: + case X86::PTILESTOREDV: + case X86::PTDPBSSDV: + case X86::PTILEZEROV: + unsigned NumOperands = MI.getNumOperands(); + MI.RemoveOperand(NumOperands - 1); + MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); + break; } } } } bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MRI = &mf.getRegInfo(); ST = &mf.getSubtarget(); TRI = ST->getRegisterInfo(); TII = mf.getSubtarget().getInstrInfo(); DomTree = &getAnalysis(); - LIS = &getAnalysis(); - auto *TileConfigPoint = getTileConfigPoint(); - if (!TileConfigPoint) + MachineInstr *MI = getTileConfigPoint(); + if (!MI) return false; unsigned Size = ST->getTileConfigSize(); Align Alignment = ST->getTileConfigAlignment(); int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); - buildConfigMI(TileConfigPoint, SS); - reloadTileConfig(SS); - VTileRegs.clear(); + Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); + addTileCFGUse(mf, CFG); return true; } FunctionPass *llvm::createX86PreTileConfigPass() { return new X86PreTileConfig(); } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index c8723c8268f2..75cbd4e1cff1 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -1,641 +1,646 @@ //===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file describes the X86 Register file, defining the registers themselves, // aliases between the registers, and the register classes built out of the // registers. // //===----------------------------------------------------------------------===// class X86Reg Enc, list subregs = []> : Register { let Namespace = "X86"; let HWEncoding = Enc; let SubRegs = subregs; } // Subregister indices. let Namespace = "X86" in { def sub_8bit : SubRegIndex<8>; def sub_8bit_hi : SubRegIndex<8, 8>; def sub_8bit_hi_phony : SubRegIndex<8, 8>; def sub_16bit : SubRegIndex<16>; def sub_16bit_hi : SubRegIndex<16, 16>; def sub_32bit : SubRegIndex<32>; def sub_xmm : SubRegIndex<128>; def sub_ymm : SubRegIndex<256>; def sub_mask_0 : SubRegIndex<-1>; def sub_mask_1 : SubRegIndex<-1, -1>; } //===----------------------------------------------------------------------===// // Register definitions... // // In the register alias definitions below, we define which registers alias // which others. We only specify which registers the small registers alias, // because the register file generator is smart enough to figure out that // AL aliases AX if we tell it that AX aliased AL (for example). // Dwarf numbering is different for 32-bit and 64-bit, and there are // variations by target as well. Currently the first entry is for X86-64, // second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux // and debug information on X86-32/Darwin) // 8-bit registers // Low registers def AL : X86Reg<"al", 0>; def DL : X86Reg<"dl", 2>; def CL : X86Reg<"cl", 1>; def BL : X86Reg<"bl", 3>; // High registers. On x86-64, these cannot be used in any instruction // with a REX prefix. def AH : X86Reg<"ah", 4>; def DH : X86Reg<"dh", 6>; def CH : X86Reg<"ch", 5>; def BH : X86Reg<"bh", 7>; // X86-64 only, requires REX. 
let CostPerUse = 1 in { def SIL : X86Reg<"sil", 6>; def DIL : X86Reg<"dil", 7>; def BPL : X86Reg<"bpl", 5>; def SPL : X86Reg<"spl", 4>; def R8B : X86Reg<"r8b", 8>; def R9B : X86Reg<"r9b", 9>; def R10B : X86Reg<"r10b", 10>; def R11B : X86Reg<"r11b", 11>; def R12B : X86Reg<"r12b", 12>; def R13B : X86Reg<"r13b", 13>; def R14B : X86Reg<"r14b", 14>; def R15B : X86Reg<"r15b", 15>; } let isArtificial = 1 in { // High byte of the low 16 bits of the super-register: def SIH : X86Reg<"", -1>; def DIH : X86Reg<"", -1>; def BPH : X86Reg<"", -1>; def SPH : X86Reg<"", -1>; def R8BH : X86Reg<"", -1>; def R9BH : X86Reg<"", -1>; def R10BH : X86Reg<"", -1>; def R11BH : X86Reg<"", -1>; def R12BH : X86Reg<"", -1>; def R13BH : X86Reg<"", -1>; def R14BH : X86Reg<"", -1>; def R15BH : X86Reg<"", -1>; // High word of the low 32 bits of the super-register: def HAX : X86Reg<"", -1>; def HDX : X86Reg<"", -1>; def HCX : X86Reg<"", -1>; def HBX : X86Reg<"", -1>; def HSI : X86Reg<"", -1>; def HDI : X86Reg<"", -1>; def HBP : X86Reg<"", -1>; def HSP : X86Reg<"", -1>; def HIP : X86Reg<"", -1>; def R8WH : X86Reg<"", -1>; def R9WH : X86Reg<"", -1>; def R10WH : X86Reg<"", -1>; def R11WH : X86Reg<"", -1>; def R12WH : X86Reg<"", -1>; def R13WH : X86Reg<"", -1>; def R14WH : X86Reg<"", -1>; def R15WH : X86Reg<"", -1>; } // 16-bit registers let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { def AX : X86Reg<"ax", 0, [AL,AH]>; def DX : X86Reg<"dx", 2, [DL,DH]>; def CX : X86Reg<"cx", 1, [CL,CH]>; def BX : X86Reg<"bx", 3, [BL,BH]>; } let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in { def SI : X86Reg<"si", 6, [SIL,SIH]>; def DI : X86Reg<"di", 7, [DIL,DIH]>; def BP : X86Reg<"bp", 5, [BPL,BPH]>; def SP : X86Reg<"sp", 4, [SPL,SPH]>; } def IP : X86Reg<"ip", 0>; // X86-64 only, requires REX. 
let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = 1, CoveredBySubRegs = 1 in { def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>; def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>; def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>; def R11W : X86Reg<"r11w", 11, [R11B,R11BH]>; def R12W : X86Reg<"r12w", 12, [R12B,R12BH]>; def R13W : X86Reg<"r13w", 13, [R13B,R13BH]>; def R14W : X86Reg<"r14w", 14, [R14B,R14BH]>; def R15W : X86Reg<"r15w", 15, [R15B,R15BH]>; } // 32-bit registers let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in { def EAX : X86Reg<"eax", 0, [AX, HAX]>, DwarfRegNum<[-2, 0, 0]>; def EDX : X86Reg<"edx", 2, [DX, HDX]>, DwarfRegNum<[-2, 2, 2]>; def ECX : X86Reg<"ecx", 1, [CX, HCX]>, DwarfRegNum<[-2, 1, 1]>; def EBX : X86Reg<"ebx", 3, [BX, HBX]>, DwarfRegNum<[-2, 3, 3]>; def ESI : X86Reg<"esi", 6, [SI, HSI]>, DwarfRegNum<[-2, 6, 6]>; def EDI : X86Reg<"edi", 7, [DI, HDI]>, DwarfRegNum<[-2, 7, 7]>; def EBP : X86Reg<"ebp", 5, [BP, HBP]>, DwarfRegNum<[-2, 4, 5]>; def ESP : X86Reg<"esp", 4, [SP, HSP]>, DwarfRegNum<[-2, 5, 4]>; def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>; } // X86-64 only, requires REX let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = 1, CoveredBySubRegs = 1 in { def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>; def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>; def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>; def R11D : X86Reg<"r11d", 11, [R11W,R11WH]>; def R12D : X86Reg<"r12d", 12, [R12W,R12WH]>; def R13D : X86Reg<"r13d", 13, [R13W,R13WH]>; def R14D : X86Reg<"r14d", 14, [R14W,R14WH]>; def R15D : X86Reg<"r15d", 15, [R15W,R15WH]>; } // 64-bit registers, X86-64 only let SubRegIndices = [sub_32bit] in { def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>; def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>; def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>; def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>; def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>; def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>; def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>; def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>; // These also require REX. let CostPerUse = 1 in { def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>; def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>; def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>; def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>; def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>; def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>; def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>; def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>; def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>; }} // MMX Registers. These are actually aliased to ST0 .. 
ST7 def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>; def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>; def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>; def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>; def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>; def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>; def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>; def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>; // Pseudo Floating Point registers def FP0 : X86Reg<"fp0", 0>; def FP1 : X86Reg<"fp1", 0>; def FP2 : X86Reg<"fp2", 0>; def FP3 : X86Reg<"fp3", 0>; def FP4 : X86Reg<"fp4", 0>; def FP5 : X86Reg<"fp5", 0>; def FP6 : X86Reg<"fp6", 0>; def FP7 : X86Reg<"fp7", 0>; // XMM Registers, used by the various SSE instruction set extensions. def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>; def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>; def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>; def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>; def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>; def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>; def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>; // X86-64 only let CostPerUse = 1 in { def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>; def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>; def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>; def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>; def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>; def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>; def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>; def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>; def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>; def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>; def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>; def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>; def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>; def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>; def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>; def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>; def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>; def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>; def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>; def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>; } // CostPerUse // YMM0-15 registers, used by AVX instructions and // YMM16-31 registers, used by AVX-512 instructions. let SubRegIndices = [sub_xmm] in { foreach Index = 0-31 in { def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast("XMM"#Index)]>, DwarfRegAlias("XMM"#Index)>; } } // ZMM Registers, used by AVX-512 instructions. let SubRegIndices = [sub_ymm] in { foreach Index = 0-31 in { def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast("YMM"#Index)]>, DwarfRegAlias("XMM"#Index)>; } } // Tile config registers. def TMMCFG: X86Reg<"tmmcfg", 0>; // Tile "registers". def TMM0: X86Reg<"tmm0", 0>; def TMM1: X86Reg<"tmm1", 1>; def TMM2: X86Reg<"tmm2", 2>; def TMM3: X86Reg<"tmm3", 3>; def TMM4: X86Reg<"tmm4", 4>; def TMM5: X86Reg<"tmm5", 5>; def TMM6: X86Reg<"tmm6", 6>; def TMM7: X86Reg<"tmm7", 7>; // Mask Registers, used by AVX-512 instructions. 
def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>; def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>; def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, 95, 95]>; def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, 96, 96]>; def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, 97, 97]>; def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, 98, 98]>; def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, 99, 99]>; def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That // way both kinds can be live without exceeding the stack depth. ST registers // are only live around inline assembly. def ST0 : X86Reg<"st", 0>, DwarfRegNum<[33, 12, 11]>; def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>; def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>; def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>; def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>; def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>; def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>; def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; // Floating-point status word def FPSW : X86Reg<"fpsr", 0>; // Floating-point control word def FPCW : X86Reg<"fpcr", 0>; // SIMD Floating-point control register. // Note: We only model the "Uses" of the control bits: current rounding modes, // DAZ, FTZ and exception masks. We don't model the "Defs" of flag bits. def MXCSR : X86Reg<"mxcsr", 0>; // Status flags register. // // Note that some flags that are commonly thought of as part of the status // flags register are modeled separately. Typically this is due to instructions // reading and updating those flags independently of all the others. We don't // want to create false dependencies between these instructions and so we use // a separate register to model them. def EFLAGS : X86Reg<"flags", 0>; // The direction flag. def DF : X86Reg<"dirflag", 0>; // Segment registers def CS : X86Reg<"cs", 1>; def DS : X86Reg<"ds", 3>; def SS : X86Reg<"ss", 2>; def ES : X86Reg<"es", 0>; def FS : X86Reg<"fs", 4>; def GS : X86Reg<"gs", 5>; // Debug registers def DR0 : X86Reg<"dr0", 0>; def DR1 : X86Reg<"dr1", 1>; def DR2 : X86Reg<"dr2", 2>; def DR3 : X86Reg<"dr3", 3>; def DR4 : X86Reg<"dr4", 4>; def DR5 : X86Reg<"dr5", 5>; def DR6 : X86Reg<"dr6", 6>; def DR7 : X86Reg<"dr7", 7>; def DR8 : X86Reg<"dr8", 8>; def DR9 : X86Reg<"dr9", 9>; def DR10 : X86Reg<"dr10", 10>; def DR11 : X86Reg<"dr11", 11>; def DR12 : X86Reg<"dr12", 12>; def DR13 : X86Reg<"dr13", 13>; def DR14 : X86Reg<"dr14", 14>; def DR15 : X86Reg<"dr15", 15>; // Control registers def CR0 : X86Reg<"cr0", 0>; def CR1 : X86Reg<"cr1", 1>; def CR2 : X86Reg<"cr2", 2>; def CR3 : X86Reg<"cr3", 3>; def CR4 : X86Reg<"cr4", 4>; def CR5 : X86Reg<"cr5", 5>; def CR6 : X86Reg<"cr6", 6>; def CR7 : X86Reg<"cr7", 7>; def CR8 : X86Reg<"cr8", 8>; def CR9 : X86Reg<"cr9", 9>; def CR10 : X86Reg<"cr10", 10>; def CR11 : X86Reg<"cr11", 11>; def CR12 : X86Reg<"cr12", 12>; def CR13 : X86Reg<"cr13", 13>; def CR14 : X86Reg<"cr14", 14>; def CR15 : X86Reg<"cr15", 15>; // Pseudo index registers def EIZ : X86Reg<"eiz", 4>; def RIZ : X86Reg<"riz", 4>; // Bound registers, used in MPX instructions def BND0 : X86Reg<"bnd0", 0>; def BND1 : X86Reg<"bnd1", 1>; def BND2 : X86Reg<"bnd2", 2>; def BND3 : X86Reg<"bnd3", 3>; // CET registers - Shadow Stack Pointer def SSP : X86Reg<"ssp", 0>; //===----------------------------------------------------------------------===// // Register Class Definitions... 
now that we have all of the pieces, define the // top-level register classes. The order specified in the register list is // implicitly defined to be the register allocation order. // // List call-clobbered registers before callee-save registers. RBX, RBP, (and // R12, R13, R14, and R15 for X86-64) are callee-save registers. // In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and // R8B, ... R15B. // Allocate R12 and R13 last, as these require an extra byte when // encoded in x86_64 instructions. // FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in // 64-bit mode. The main complication is that they cannot be encoded in an // instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc. // require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" // cannot be encoded. def GR8 : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { let AltOrders = [(sub GR8, AH, BH, CH, DH)]; let AltOrderSelect = [{ return MF.getSubtarget().is64Bit(); }]; } let isAllocatable = 0 in def GRH8 : RegisterClass<"X86", [i8], 8, (add SIH, DIH, BPH, SPH, R8BH, R9BH, R10BH, R11BH, R12BH, R13BH, R14BH, R15BH)>; def GR16 : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, SI, DI, BX, BP, SP, R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; let isAllocatable = 0 in def GRH16 : RegisterClass<"X86", [i16], 16, (add HAX, HCX, HDX, HSI, HDI, HBX, HBP, HSP, HIP, R8WH, R9WH, R10WH, R11WH, R12WH, R13WH, R14WH, R15WH)>; def GR32 : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. // FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra // tests because of the inclusion of RIP in this register class. def GR64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. Always contain a 16-bit segment // descriptor. def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; // Debug registers. def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 15)>; // Control registers. def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; // GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of // GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" // registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers // that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, // and GR64_ABCD are classes for registers that support 8-bit h-register // operations. 
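// Illustrative C++ sketch, not part of this file: the def order above is the
// implicit allocation order, and GR8's AltOrderSelect drops the h-registers
// (AH, BH, CH, DH) in 64-bit mode; the ABCD classes defined next carve that
// h-register-capable subset out explicitly. Assuming the standard
// RegisterClassInfo analysis, a pass could inspect the effective order like so:
//
//   #include "X86RegisterInfo.h"                  // for X86::GR8RegClass
//   #include "llvm/CodeGen/MachineFunction.h"
//   #include "llvm/CodeGen/RegisterClassInfo.h"
//   #include "llvm/CodeGen/TargetRegisterInfo.h"
//   #include "llvm/Support/raw_ostream.h"
//
//   static void printGR8AllocOrder(const llvm::MachineFunction &MF) {
//     llvm::RegisterClassInfo RCI;
//     RCI.runOnMachineFunction(MF); // recompute order, honoring reserved regs
//     const llvm::TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
//     for (llvm::MCPhysReg R : RCI.getOrder(&X86::GR8RegClass))
//       llvm::errs() << llvm::printReg(R, TRI) << ' ';
//     llvm::errs() << '\n';
//   }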
def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESP)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP, RSP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, R8, R9, R10, R11, RIP, RSP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, AH, CH, DH, BL, BH)> { let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; let AltOrderSelect = [{ return MF.getSubtarget().is64Bit(); }]; } // GR16_NOREX - GR16 registers which do not require a REX prefix. def GR16_NOREX : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, SI, DI, BX, BP, SP)>; // GR32_NOREX - GR32 registers which do not require a REX prefix. def GR32_NOREX : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; // GR64_NOREX - GR64 registers which do not require a REX prefix. def GR64_NOREX : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; // GR32_NOSP - GR32 registers except ESP. def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; // GR64_NOSP - GR64 registers except RSP (and RIP). def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; // GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except // ESP. def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, (and GR32_NOREX, GR32_NOSP)>; // GR64_NOREX_NOSP - GR64_NOREX registers except RSP. def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, (and GR64_NOREX, GR64_NOSP)>; // Register classes used for ABIs that use 32-bit address accesses, // while using the whole x84_64 ISA. // In such cases, it is fine to use RIP as we are sure the 32 high // bits are not set. We do not need variants for NOSP as RIP is not // allowed there. // RIP is not spilled anywhere for now, so stick to 32-bit alignment // to save on memory space. // FIXME: We could allow all 64bit registers, but we would need // something to check that the 32 high bits are not set, // which we do not have right now. def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; // When RBP is used as a base pointer in a 32-bit addresses environment, // this is also safe to use the full register to access addresses. // Since RBP will never be spilled, stick to a 32 alignment to save // on memory consumption. def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, (add LOW32_ADDR_ACCESS, RBP)>; // A class to support the 'A' assembler constraint: [ER]AX then [ER]DX. def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; def GR64_AD : RegisterClass<"X86", [i64], 64, (add RAX, RDX)>; // Classes to support the 64-bit assembler constraint tied to a fixed // register in 32-bit mode. The second register is always the next in // the list. Wrap around causes an error. 
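// A hedged user-level illustration, not part of this file: GR32_AD/GR64_AD
// above back the 'A' inline-asm constraint, and the pair classes defined below
// serve other fixed register pairings. On 32-bit x86 a 64-bit "=A" output
// lands in the EDX:EAX pair, e.g. reading the timestamp counter:
//
//   #if defined(__i386__)
//   static inline unsigned long long ReadTSC() {
//     unsigned long long TSC;
//     asm volatile("rdtsc" : "=A"(TSC)); // EDX:EAX pair, i.e. GR32_AD members
//     return TSC;
//   }
//   #endif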
def GR32_DC : RegisterClass<"X86", [i32], 32, (add EDX, ECX)>; def GR32_CB : RegisterClass<"X86", [i32], 32, (add ECX, EBX)>; def GR32_BSI : RegisterClass<"X86", [i32], 32, (add EBX, ESI)>; def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>; def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>; def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>; // Scalar SSE2 floating point registers. def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill // values as 64-bit quantities instead of 80-bit quantities, which is much much // faster on common hardware. In reality, this should be controlled by a // command line option or something. def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; // st(7) may be is not allocatable. def RFP80_7 : RegisterClass<"X86",[f80], 32, (add FP7)> { let isAllocatable = 0; } // Floating point stack registers (these are not allocatable by the // register allocator - the floating point stackifier is responsible // for transforming FPn allocations to STn registers) def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { let isAllocatable = 0; } // Helper to allow %st to print as %st(0) when its encoded in the instruction. def RSTi : RegisterOperand; // Generic vector registers: VR64 and VR128. // Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32)>; def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } // AVX-512 vector/mask registers. def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Represents the lower 16 registers that have VEX/legacy encodable subregs. def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 15)>; // Scalar AVX-512 floating point registers. 
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; // Extended VR128 and VR256 for AVX-512 instructions def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32X)>; def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} // Mask register pairs def KPAIRS : RegisterTuples<[sub_mask_0, sub_mask_1], [(add K0, K2, K4, K6), (add K1, K3, K5, K7)]>; def VK1PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} def VK2PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} def VK4PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} def VK8PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} def VK16PAIR : RegisterClass<"X86", [untyped], 16, (add KPAIRS)> {let Size = 32;} def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} // Bound registers def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; // Tiles let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} +def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { + let CopyCost = -1; // Don't allow copying of tile config registers. + let isAllocatable = 1; + let Size = 512; +} diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index d6c1dcaf0588..ef010bcd38b7 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -1,250 +1,248 @@ //===-- X86TileConfig.cpp - Tile Register Configure----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file Pass to config the shape of AMX physical registers /// AMX register need to be configured before use. In X86PreTileConfig pass /// the pldtilecfg instruction is inserted, however at that time we don't /// know the shape of each physical tile registers, because the register /// allocation is not done yet. This pass runs after egister allocation /// pass. 
It collects the shape information of each physical tile register /// and store the shape in the stack slot that is allocated for load config /// to tile config register. // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "tile-config" namespace { class X86TileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; const X86Subtarget *ST = nullptr; const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; MachineDominatorTree *DomTree = nullptr; MachineRegisterInfo *MRI = nullptr; VirtRegMap *VRM = nullptr; LiveIntervals *LIS = nullptr; MachineInstr *getTileConfigPoint(); void tileConfig(); public: X86TileConfig() : MachineFunctionPass(ID) {} /// Return the pass name. StringRef getPassName() const override { return "Tile Register Configure"; } /// X86TileConfig analysis usage. void getAnalysisUsage(AnalysisUsage &AU) const override; /// Perform register allocation. bool runOnMachineFunction(MachineFunction &mf) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoPHIs); } static char ID; }; } // end anonymous namespace char X86TileConfig::ID = 0; INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure", false, false) void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } static unsigned getTilePhysRegIndex(Register PhysReg) { assert((PhysReg >= X86::TMM0 && X86::TMM0 <= X86::TMM7) && "Tile register number is invalid"); return (PhysReg - X86::TMM0); } static MachineInstr * storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, unsigned BitSize, int FrameIdx, int Offset, const TargetInstrInfo *TII, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) { unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit; unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr; if (BitSize == TRI->getRegSizeInBits(*RC)) SubIdx = 0; MachineInstr *NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx, Offset) .addReg(SrcReg, 0, SubIdx); return NewMI; } static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int64_t Imm, unsigned BitSize, int FrameIdx, int Offset, const TargetInstrInfo *TII) { unsigned Opc = (BitSize == 8) ? 
X86::MOV8mi : X86::MOV16mi; return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx, Offset) .addImm(Imm); } MachineInstr *X86TileConfig::getTileConfigPoint() { - MachineBasicBlock *Entry = &*MF->begin(); - ReversePostOrderTraversal RPOT(Entry); - for (MachineBasicBlock *MBB : RPOT) { - for (MachineInstr &MI : *MBB) + for (MachineBasicBlock &MBB : *MF) { + + // Traverse the basic block. + for (MachineInstr &MI : MBB) // Refer X86PreTileConfig.cpp. - // We only support one tile config for now. The other ldtilecfg - // is for spill purpose and is dominated by the first ldtilecfg. - if (MI.getOpcode() == X86::LDTILECFG) + // We only support one tile config for now. + if (MI.getOpcode() == X86::PLDTILECFG) return &MI; } return nullptr; } void X86TileConfig::tileConfig() { MachineInstr *MI = getTileConfigPoint(); if (!MI) return; MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(0).getIndex(); + int SS = MI->getOperand(1).getIndex(); BitVector PhysRegs(TRI->getNumRegs()); // Fill in the palette first. auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII); LIS->InsertMachineInstrInMaps(*NewMI); // Fill in the shape of each tile physical register. for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { Register VirtReg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(VirtReg)) continue; const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); if (RC.getID() != X86::TILERegClassID) continue; Register PhysReg = VRM->getPhys(VirtReg); if (PhysRegs.test(PhysReg)) continue; PhysRegs.set(PhysReg); ShapeT Shape = VRM->getShape(VirtReg); Register RowReg = Shape.getRow()->getReg(); Register ColReg = Shape.getCol()->getReg(); // Here is the data format for the tile config. // 0 palette // 1 start_row // 2-15 reserved, must be zero // 16-17 tile0.colsb Tile 0 bytes per row. // 18-19 tile1.colsb Tile 1 bytes per row. // 20-21 tile2.colsb Tile 2 bytes per row. // ... (sequence continues) // 30-31 tile7.colsb Tile 7 bytes per row. // 32-47 reserved, must be zero // 48 tile0.rows Tile 0 rows. // 49 tile1.rows Tile 1 rows. // 50 tile2.rows Tile 2 rows. // ... (sequence continues) // 55 tile7.rows Tile 7 rows. // 56-63 reserved, must be zero unsigned Index = getTilePhysRegIndex(PhysReg); int RowOffset = 48 + Index; int ColOffset = 16 + Index * 2; unsigned BitSize = 8; for (const auto &Pair : {std::make_pair(RowReg, RowOffset), std::make_pair(ColReg, ColOffset)}) { int64_t Imm; int ImmCount = 0; // All def must be the same value, otherwise it is invalid MIs. // Immediate is prefered. for (const MachineOperand &MO : MRI->def_operands(Pair.first)) { const auto *Inst = MO.getParent(); if (Inst->isMoveImmediate()) { ImmCount++; Imm = Inst->getOperand(1).getImm(); break; } } auto StoreConfig = [&](int Offset) { MachineInstr *NewMI = nullptr; if (ImmCount) NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII); else { const TargetRegisterClass *RC = MRI->getRegClass(Pair.first); NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS, Offset, TII, RC, TRI); } SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI); if (!ImmCount) { // Extend the live interval. 
SmallVector EndPoints = {SIdx.getRegSlot()}; LiveInterval &Int = LIS->getInterval(Pair.first); LIS->extendToIndices(Int, EndPoints); } }; StoreConfig(Pair.second); BitSize += 8; } } } bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MRI = &mf.getRegInfo(); ST = &mf.getSubtarget(); TRI = ST->getRegisterInfo(); TII = mf.getSubtarget().getInstrInfo(); DomTree = &getAnalysis(); VRM = &getAnalysis(); LIS = &getAnalysis(); if (VRM->isShapeMapEmpty()) return false; tileConfig(); return true; } FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); } diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll index 87973fd9c315..a68a81b8d732 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -1,125 +1,71 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s -@buf = dso_local global [3072 x i8] zeroinitializer, align 64 +%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }> -define internal void @foo() #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq -; -; IPRA-LABEL: foo: -; IPRA: # %bb.0: # %entry -; IPRA-NEXT: pushq %rbp -; IPRA-NEXT: .cfi_def_cfa_offset 16 -; IPRA-NEXT: .cfi_offset %rbp, -16 -; IPRA-NEXT: movq %rsp, %rbp -; IPRA-NEXT: .cfi_def_cfa_register %rbp -; IPRA-NEXT: popq %rbp -; IPRA-NEXT: .cfi_def_cfa %rsp, 8 -; IPRA-NEXT: retq -entry: - ret void -} +@buf = dso_local global [3072 x i8] zeroinitializer, align 64 define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-LABEL: test_api: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8 ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movw $8, %r15w ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm1, 2048(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf+2048, %eax +; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: tileloadd 
(%rax,%r14), %tmm0 ; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload ; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 1024(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tilestored %tmm0, (%rax,%r14) ; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq -; -; IPRA-LABEL: test_api: -; IPRA: # %bb.0: -; IPRA-NEXT: pushq %rbp -; IPRA-NEXT: subq $64, %rsp -; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; IPRA-NEXT: vmovdqu64 %zmm0, (%rsp) -; IPRA-NEXT: movb $1, (%rsp) -; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) -; IPRA-NEXT: ldtilecfg (%rsp) -; IPRA-NEXT: movl $buf, %eax -; IPRA-NEXT: movl $32, %ecx -; IPRA-NEXT: movw $8, %dx -; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 -; IPRA-NEXT: movl $buf+1024, %eax -; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 -; IPRA-NEXT: callq foo -; IPRA-NEXT: movl $buf+2048, %eax -; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2 -; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 -; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx) -; IPRA-NEXT: addq $64, %rsp -; IPRA-NEXT: popq %rbp -; IPRA-NEXT: tilerelease -; IPRA-NEXT: vzeroupper -; IPRA-NEXT: retq %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - call void @foo() + tail call void (...) @foo() %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) ret void } +declare dso_local void @foo(...) 
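; The checks above cover tiles that are live across the call to @foo: the tile
; data is spilled with tilestored and reloaded with tileloadd around the call,
; and the 64-byte tile configuration itself is saved with sttilecfg before the
; call and restored with ldtilecfg after it.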
+
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
-
-attributes #0 = { noinline nounwind optnone uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
index f38554b9f79d..a415d9c15242 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+amx-bf16 -verify-machineinstrs | FileCheck %s
 define void @test_amx() {
 ; CHECK-LABEL: test_amx:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
+; CHECK-NEXT: retq
 call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
 ret void
 }
 declare void @llvm.x86.tdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
deleted file mode 100644
index b381429c9374..000000000000
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
-@buf = dso_local global [3072 x i8] zeroinitializer, align 16
-
-define dso_local void @test1(i16 signext %0, i16 signext %1) local_unnamed_addr {
-; CHECK-LABEL: test1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl $buf, %eax
-; CHECK-NEXT: movl $32, %ecx
-; CHECK-NEXT: movw $8, %dx
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
-; CHECK-NEXT: movl $buf+1024, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
-; CHECK-NEXT: movl $buf+2048, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
-; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx)
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: jmp foo # TAILCALL
- %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
- %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
- %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
- tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
- tail call void @foo()
- ret void
-}
-
-define dso_local void @test2(i16 signext %0, i16 signext %1) local_unnamed_addr {
-; CHECK-LABEL: test2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: subq $72, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 96
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB1_3
-; CHECK-NEXT: # %bb.1: # %if.true
-; CHECK-NEXT: movw $8, %ax
-; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: movl $32, %ecx
-; CHECK-NEXT: movl $buf+1024, %edx
-; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1
-; CHECK-NEXT: movl $buf+2048, %edx
-; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
-; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx)
-; CHECK-NEXT: jmp .LBB1_2
-; CHECK-NEXT: .LBB1_3: # %if.false
-; CHECK-NEXT: movl $buf, %eax
-; CHECK-NEXT: movl $32, %ecx
-; CHECK-NEXT: movw $8, %dx
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3
-; CHECK-NEXT: movl $buf+1024, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
-; CHECK-NEXT: movl $buf+2048, %eax
-; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
-; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3
-; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx)
-; CHECK-NEXT: .LBB1_2: # %if.true
-; CHECK-NEXT: addq $72, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: retq
- call void @foo()
- br i1 undef, label %if.true, label %if.false
-
-if.true:
- %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
- %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
- %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
- tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
- br label %exit
-
-if.false:
- %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
- %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
- %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
- %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
- tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
- br label %exit
-
-exit:
- ret void
-}
-
-declare dso_local void @foo() local_unnamed_addr
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 57b67c456b36..0dc0c34c340c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -1,204 +1,204 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
 @buf = dso_local global [3072 x i8] zeroinitializer, align 64
 define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-LABEL: test_api:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: pushq %r15
 ; CHECK-NEXT: pushq %r14
 ; CHECK-NEXT: pushq %rbx
 ; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
 ; CHECK-NEXT: movl %esi, %ebx
 ; CHECK-NEXT: movl %edi, %ebp
 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movl $32, %r14d
 ; CHECK-NEXT: movl $buf+2048, %r15d
 ; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: # %bb.1: # %if.true
+; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
 ; CHECK-NEXT: movl $buf, %eax
 ; CHECK-NEXT: movw $8, %cx
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.true
 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
 ; CHECK-NEXT: movl $buf+1024, %eax
 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm5, 2048(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm5
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm5, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
 ; CHECK-NEXT: jmp .LBB0_3
 ; CHECK-NEXT: .LBB0_2: # %if.false
-; CHECK-NEXT: movl $buf, %eax
-; CHECK-NEXT: movw $8, %cx
 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
 ; CHECK-NEXT: movl $buf+1024, %eax
 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm5, 2048(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT: tdpbssd %tmm3, %tmm2, %tmm5
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm5, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
 ; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)
 ; CHECK-NEXT: .LBB0_3: # %exit
 ; CHECK-NEXT: movl $buf, %eax
 ; CHECK-NEXT: movl $32, %ecx
 ; CHECK-NEXT: movw $8, %dx
 ; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm5 # 1024-byte Folded Reload
 ; CHECK-NEXT: tdpbssd %tmm4, %tmm6, %tmm5
 ; CHECK-NEXT: movl $buf+2048, %eax
 ; CHECK-NEXT: tilestored %tmm5, (%rax,%rcx)
 ; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r14
 ; CHECK-NEXT: popq %r15
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: tilerelease
 ; CHECK-NEXT: retq
 %c = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
 br i1 undef, label %if.true, label %if.false
 if.true:
 %a1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
 %b1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
 %d1 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %c, x86_amx %a1, x86_amx %b1)
 tail call void (...) @foo()
 br label %exit
 if.false:
 %a2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
 %b2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
 %d2 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %c, x86_amx %a2, x86_amx %b2)
 tail call void (...) @foo()
 tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %d2)
 br label %exit
 exit:
 %d = phi x86_amx [ %d1, %if.true ], [ %d2, %if.false ]
 %a = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
 %res = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %c, x86_amx %d, x86_amx %a)
 tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %res)
 ret void
 }
 define dso_local void @test3(i8 *%buf) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: pushq %r15
 ; CHECK-NEXT: pushq %r14
 ; CHECK-NEXT: pushq %rbx
 ; CHECK-NEXT: subq $3032, %rsp # imm = 0xBD8
 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, %r15w
 ; CHECK-NEXT: tilezero %tmm0
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: jne .LBB1_3
 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader
 ; CHECK-NEXT: movq %rdi, %rbx
 ; CHECK-NEXT: movl $32, %r14d
 ; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB1_2: # %loop.header
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
 ; CHECK-NEXT: tilezero %tmm0
 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
 ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
 ; CHECK-NEXT: incl %ebp
 ; CHECK-NEXT: cmpw $100, %bp
 ; CHECK-NEXT: jl .LBB1_2
 ; CHECK-NEXT: .LBB1_3: # %exit
 ; CHECK-NEXT: addq $3032, %rsp # imm = 0xBD8
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r14
 ; CHECK-NEXT: popq %r15
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: tilerelease
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 entry:
 %t5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
 br i1 undef, label %loop.header, label %exit
 loop.header:
 %ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
 call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t5)
 call void (...) @foo()
 br label %loop.body
 loop.body:
 %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
 %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
 %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
 %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
 tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
 br label %loop.latch
 loop.latch:
 %iv = add i16 %ivphi, 1
 %c = icmp slt i16 %iv, 100
 br i1 %c, label %loop.header, label %exit
 exit:
 ret void
 }
 declare dso_local void @foo(...) nounwind
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 1e1154b5f759..b851eea60b0a 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -1,212 +1,210 @@
 ; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
 ; pass. Ignore it with 'grep -v'.
 ; RUN: llc -mtriple=x86_64-- -O1 -debug-pass=Structure < %s -o /dev/null 2>&1 \
 ; RUN: | grep -v 'Verify generated machine code' | FileCheck %s
 ; RUN: llc -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 \
 ; RUN: | grep -v 'Verify generated machine code' | FileCheck %s
 ; RUN: llc -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 \
 ; RUN: | grep -v 'Verify generated machine code' | FileCheck %s
 ; REQUIRES: asserts
 ; CHECK-LABEL: Pass Arguments:
 ; CHECK-NEXT: Target Library Information
 ; CHECK-NEXT: Target Pass Configuration
 ; CHECK-NEXT: Machine Module Information
 ; CHECK-NEXT: Target Transform Information
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT: Expand Atomic instructions
 ; CHECK-NEXT: Lower AMX type for load/store
 ; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Canonicalize natural loops
 ; CHECK-NEXT: Scalar Evolution Analysis
 ; CHECK-NEXT: Loop Pass Manager
 ; CHECK-NEXT: Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT: Induction Variable Users
 ; CHECK-NEXT: Loop Strength Reduction
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Expand memcmp() to load/stores
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
 ; CHECK-NEXT: Lower constant intrinsics
 ; CHECK-NEXT: Remove unreachable blocks from the CFG
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Post-Dominator Tree Construction
 ; CHECK-NEXT: Branch Probability Analysis
 ; CHECK-NEXT: Block Frequency Analysis
 ; CHECK-NEXT: Constant Hoisting
 ; CHECK-NEXT: Partially inline calls to library functions
 ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
 ; CHECK-NEXT: Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT: Expand reduction intrinsics
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Interleaved Access Pass
 ; CHECK-NEXT: X86 Partial Reduction
 ; CHECK-NEXT: Expand indirectbr instructions
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: CodeGen Prepare
 ; CHECK-NEXT: Rewrite Symbols
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Exception handling preparation
 ; CHECK-NEXT: Safe Stack instrumentation pass
 ; CHECK-NEXT: Insert stack protectors
 ; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Post-Dominator Tree Construction
 ; CHECK-NEXT: Branch Probability Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: X86 DAG->DAG Instruction Selection
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT: X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT: X86 Domain Reassignment Pass
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
 ; CHECK-NEXT: Slot index numbering
 ; CHECK-NEXT: Merge disjoint stack slots
 ; CHECK-NEXT: Local Stack Slot Allocation
 ; CHECK-NEXT: Remove dead machine instructions
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Machine Trace Metrics
 ; CHECK-NEXT: Early If-Conversion
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine InstCombiner
 ; CHECK-NEXT: X86 cmov Conversion
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Machine Loop Invariant Code Motion
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Common Subexpression Elimination
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Machine code sinking
 ; CHECK-NEXT: Peephole Optimizations
 ; CHECK-NEXT: Remove dead machine instructions
 ; CHECK-NEXT: Live Range Shrink
 ; CHECK-NEXT: X86 Fixup SetCC
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 LEA Optimize
 ; CHECK-NEXT: X86 Optimize Call Frame
 ; CHECK-NEXT: X86 Avoid Store Forwarding Block
 ; CHECK-NEXT: X86 speculative load hardening
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: X86 EFLAGS copy lowering
 ; CHECK-NEXT: X86 WinAlloca Expander
 ; CHECK-NEXT: MachineDominator Tree Construction
-; CHECK-NEXT: Slot index numbering
-; CHECK-NEXT: Live Interval Analysis
 ; CHECK-NEXT: Tile Register Pre-configure
 ; CHECK-NEXT: Detect Dead Lanes
 ; CHECK-NEXT: Process Implicit Definitions
 ; CHECK-NEXT: Remove unreachable machine basic blocks
 ; CHECK-NEXT: Live Variable Analysis
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Slot index numbering
 ; CHECK-NEXT: Live Interval Analysis
 ; CHECK-NEXT: Simple Register Coalescing
 ; CHECK-NEXT: Rename Disconnected Subregister Components
 ; CHECK-NEXT: Machine Instruction Scheduler
 ; CHECK-NEXT: Machine Block Frequency Analysis
 ; CHECK-NEXT: Debug Variable Analysis
 ; CHECK-NEXT: Live Stack Slot Analysis
 ; CHECK-NEXT: Virtual Register Map
 ; CHECK-NEXT: Live Register Matrix
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: Spill Code Placement Analysis
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Greedy Register Allocator
 ; CHECK-NEXT: Tile Register Configure
 ; CHECK-NEXT: Virtual Register Rewriter
 ; CHECK-NEXT: Stack Slot Coloring
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Machine Loop Invariant Code Motion
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: X86 FP Stackifier
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Dominance Frontier Construction
 ; CHECK-NEXT: X86 Load Value Injection (LVI) Load Hardening
 ; CHECK-NEXT: Fixup Statepoint Caller Saved
 ; CHECK-NEXT: PostRA Machine Sink
 ; CHECK-NEXT: Machine Block Frequency Analysis
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: Shrink Wrapping analysis
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Control Flow Optimizer
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Tail Duplication
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
 ; CHECK-NEXT: X86 pseudo instruction expansion pass
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Post RA top-down list latency scheduler
 ; CHECK-NEXT: Analyze Machine Code For Garbage Collection
 ; CHECK-NEXT: Machine Block Frequency Analysis
 ; CHECK-NEXT: MachinePostDominator Tree Construction
 ; CHECK-NEXT: Branch Probability Basic Block Placement
 ; CHECK-NEXT: Insert fentry calls
 ; CHECK-NEXT: Insert XRay ops
 ; CHECK-NEXT: Implement the 'patchable-function' attribute
 ; CHECK-NEXT: ReachingDefAnalysis
 ; CHECK-NEXT: X86 Execution Dependency Fix
 ; CHECK-NEXT: BreakFalseDeps
 ; CHECK-NEXT: X86 Indirect Branch Tracking
 ; CHECK-NEXT: X86 vzeroupper inserter
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Byte/Word Instruction Fixup
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Atom pad short functions
 ; CHECK-NEXT: X86 LEA Fixup
 ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
 ; CHECK-NEXT: X86 Discriminate Memory Operands
 ; CHECK-NEXT: X86 Insert Cache Prefetches
 ; CHECK-NEXT: X86 insert wait instruction
 ; CHECK-NEXT: Contiguously Lay Out Funclets
 ; CHECK-NEXT: StackMap Liveness Analysis
 ; CHECK-NEXT: Live DEBUG_VALUE analysis
 ; CHECK-NEXT: X86 Speculative Execution Side Effect Suppression
 ; CHECK-NEXT: X86 Indirect Thunks
 ; CHECK-NEXT: Check CFA info and insert CFI instructions if needed
 ; CHECK-NEXT: X86 Load Value Injection (LVI) Ret-Hardening
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: X86 Assembly Printer
 ; CHECK-NEXT: Free MachineFunction
 define void @f() {
 ret void
 }