Index: include/llvm/CodeGen/LiveRangeEdit.h
===================================================================
--- include/llvm/CodeGen/LiveRangeEdit.h
+++ include/llvm/CodeGen/LiveRangeEdit.h
@@ -60,6 +60,7 @@
 private:
   LiveInterval *Parent;
   SmallVectorImpl<unsigned> &NewRegs;
+  SmallPtrSet<MachineInstr *, 32> *DeadRemats;
   MachineRegisterInfo &MRI;
   LiveIntervals &LIS;
   VirtRegMap *VRM;
@@ -111,18 +112,21 @@
   /// @param parent The register being spilled or split.
   /// @param newRegs List to receive any new registers created. This needn't be
   ///                empty initially, any existing registers are ignored.
+  /// @param deadRemats The collection of all instructions that define an
+  ///                   original reg and are dead after remat.
   /// @param MF The MachineFunction the live range edit is taking place in.
   /// @param lis The collection of all live intervals in this function.
   /// @param vrm Map of virtual registers to physical registers for this
   ///            function. If NULL, no virtual register map updates will
   ///            be done. This could be the case if called before Regalloc.
   LiveRangeEdit(LiveInterval *parent, SmallVectorImpl<unsigned> &newRegs,
+                SmallPtrSet<MachineInstr *, 32> *deadRemats,
                 MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm,
                 Delegate *delegate = nullptr)
-      : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis),
-        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()),
-        TheDelegate(delegate), FirstNew(newRegs.size()),
-        ScannedRemattable(false) {
+      : Parent(parent), NewRegs(newRegs), DeadRemats(deadRemats),
+        MRI(MF.getRegInfo()), LIS(lis), VRM(vrm),
+        TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate),
+        FirstNew(newRegs.size()), ScannedRemattable(false) {
     MRI.setDelegate(this);
   }
@@ -141,6 +145,7 @@
   unsigned size() const { return NewRegs.size()-FirstNew; }
   bool empty() const { return size() == 0; }
   unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; }
+  void pop_back() { NewRegs.pop_back(); }
   ArrayRef<unsigned> regs() const {
     return makeArrayRef(NewRegs).slice(FirstNew);
@@ -175,8 +180,11 @@
   /// Remat - Information needed to rematerialize at a specific location.
   struct Remat {
     VNInfo *ParentVNI;    // parent_'s value at the remat location.
-    MachineInstr *OrigMI; // Instruction defining ParentVNI.
-    explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {}
+    VNInfo *OrigVNI;      // ParentVNI.def may be a copy only. OrigVNI.def
+                          // contains the real expr for remat.
+    MachineInstr *OrigMI; // Instruction defining OrigVNI.
+    explicit Remat(VNInfo *ParentVNI, VNInfo *OrigVNI)
+        : ParentVNI(ParentVNI), OrigVNI(OrigVNI), OrigMI(nullptr) {}
   };

   /// canRematerializeAt - Determine if ParentVNI can be rematerialized at
@@ -208,6 +216,12 @@
     return Rematted.count(ParentVNI);
   }

+  void markDeadRemat(MachineInstr *inst) {
+    // For regallocs other than Greedy, DeadRemats is nullptr for now.
+    if (DeadRemats)
+      DeadRemats->insert(inst);
+  }
+
   /// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try
   /// to erase it from LIS.
   void eraseVirtReg(unsigned Reg);
@@ -218,8 +232,11 @@
   /// RegsBeingSpilled lists registers currently being spilled by the register
   /// allocator. These registers should not be split into new intervals
   /// as currently those new intervals are not guaranteed to spill.
-  void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
-                         ArrayRef<unsigned> RegsBeingSpilled = None);
+  /// NoSplit indicates that the call happens after the iterations of
+  /// selectOrSplit, so registers should not be split into new intervals.
+  void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                         ArrayRef<unsigned> RegsBeingSpilled = None,
+                         bool NoSplit = false);

   /// calculateRegClassAndHint - Recompute register class and hint for each new
   /// register.
Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -48,13 +48,70 @@
 STATISTIC(NumFolded,          "Number of folded stack accesses");
 STATISTIC(NumFoldedLoads,     "Number of folded loads");
 STATISTIC(NumRemats,          "Number of rematerialized defs for spilling");
-STATISTIC(NumOmitReloadSpill, "Number of omitted spills of reloads");
-STATISTIC(NumHoists,          "Number of hoisted spills");

 static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
                                      cl::desc("Disable inline spill hoisting"));

 namespace {
+class HoistSpiller {
+  MachineFunction &MF;
+  LiveIntervals &LIS;
+  LiveStacks &LSS;
+  AliasAnalysis *AA;
+  MachineDominatorTree &MDT;
+  MachineLoopInfo &Loops;
+  VirtRegMap &VRM;
+  MachineFrameInfo &MFI;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  const MachineBlockFrequencyInfo &MBFI;
+
+  // Map from StackSlot to its original register.
+  DenseMap<int, unsigned> StackSlotToReg;
+  // Map from a pair of (StackSlot, Original VNI) to a set of spills which
+  // have the same stackslot and have equal values defined by Original VNI.
+  // These spills are mergeable and are hoist candidates.
+  typedef DenseMap<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>
+      MergableSpillsMap;
+  MergableSpillsMap MergableSpills;
+
+  /// Virt2SiblingsMap - This is the map from original register to a set
+  /// containing all its siblings. To hoist a spill to another BB, we need
+  /// to find a live sibling there and use it as the RHS of the new spill.
+  DenseMap<unsigned, DenseSet<unsigned>> Virt2SiblingsMap;
+
+  bool isSpillCandBB(unsigned OrigReg, VNInfo *OrigVNI, MachineBasicBlock *BB,
+                     unsigned &LiveReg);
+  void getVisitOrders(
+      MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+      SmallVectorImpl<MachineDomTreeNode *> &Orders,
+      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+      DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKept,
+      DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+  void runHoistSpills(unsigned OrigReg, VNInfo *OrigVNI,
+                      SmallPtrSet<MachineInstr *, 16> &Spills,
+                      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+                      DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+
+public:
+  HoistSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+      : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+        LSS(pass.getAnalysis<LiveStacks>()),
+        AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+        MDT(pass.getAnalysis<MachineDominatorTree>()),
+        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+        MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+        TII(*mf.getSubtarget().getInstrInfo()),
+        TRI(*mf.getSubtarget().getRegisterInfo()),
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+
+  void addToMergableSpills(MachineInstr *Spill, int StackSlot,
+                           unsigned Original);
+  bool rmFromMergableSpills(MachineInstr *Spill, int StackSlot);
+  void hoistAllSpills(LiveRangeEdit &Edit);
+};
+
 class InlineSpiller : public Spiller {
   MachineFunction &MF;
   LiveIntervals &LIS;
@@ -85,56 +142,12 @@
   // Values that failed to remat at some point.
   SmallPtrSet<VNInfo*, 8> UsedValues;

-public:
-  // Information about a value that was defined by a copy from a sibling
-  // register.
-  struct SibValueInfo {
-    // True when all reaching defs were reloads: No spill is necessary.
-    bool AllDefsAreReloads;
-
-    // True when value is defined by an original PHI not from splitting.
-    bool DefByOrigPHI;
-
-    // True when the COPY defining this value killed its source.
-    bool KillsSource;
-
-    // The preferred register to spill.
-    unsigned SpillReg;
-
-    // The value of SpillReg that should be spilled.
-    VNInfo *SpillVNI;
-
-    // The block where SpillVNI should be spilled. Currently, this must be the
-    // block containing SpillVNI->def.
-    MachineBasicBlock *SpillMBB;
-
-    // A defining instruction that is not a sibling copy or a reload, or NULL.
-    // This can be used as a template for rematerialization.
-    MachineInstr *DefMI;
-
-    // List of values that depend on this one. These values are actually the
-    // same, but live range splitting has placed them in different registers,
-    // or SSA update needed to insert PHI-defs to preserve SSA form. This is
-    // copies of the current value and phi-kills. Usually only phi-kills cause
-    // more than one dependent value.
-    TinyPtrVector<VNInfo*> Deps;
-
-    SibValueInfo(unsigned Reg, VNInfo *VNI)
-        : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false),
-          SpillReg(Reg), SpillVNI(VNI), SpillMBB(nullptr), DefMI(nullptr) {}
-
-    // Returns true when a def has been found.
-    bool hasDef() const { return DefByOrigPHI || DefMI; }
-  };
-
-private:
-  // Values in RegsToSpill defined by sibling copies.
-  typedef DenseMap<VNInfo*, SibValueInfo> SibValueMap;
-  SibValueMap SibValues;
-
   // Dead defs generated during spilling.
   SmallVector<MachineInstr*, 8> DeadDefs;

+  // Object that records spill information and performs the hoisting.
+  HoistSpiller *HSpiller;
+
   ~InlineSpiller() override {}

 public:
@@ -147,9 +160,14 @@
         MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
         TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
-        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()), HSpiller(nullptr) {
+  }

   void spill(LiveRangeEdit &) override;
+  void setHSpiller(HoistSpiller *HS) { HSpiller = HS; }
+  HoistSpiller *getHSpiller() { return HSpiller; }
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  static inline bool classof(const Spiller *V) { return true; }

 private:
   bool isSnippet(const LiveInterval &SnipLI);
@@ -161,11 +179,6 @@
   }

   bool isSibling(unsigned Reg);
-  MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*);
-  void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = nullptr);
-  void analyzeSiblingValues();
-
-  bool hoistSpill(LiveInterval &SpillLI, MachineInstr &CopyMI);
   void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
   void markValueUsed(LiveInterval*, VNInfo*);
@@ -194,6 +207,21 @@
   return new InlineSpiller(pass, mf, vrm);
 }

+void createHoistSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                        VirtRegMap &vrm, Spiller *spiller) {
+  HoistSpiller *HSpiller = new HoistSpiller(pass, mf, vrm);
+  (dyn_cast<InlineSpiller>(spiller))->setHSpiller(HSpiller);
+}
+
+void startHoistSpiller(MachineFunction &mf, VirtRegMap &vrm, LiveIntervals &lis,
+                       Spiller *spiller) {
+  SmallVector<unsigned, 4> NewVRegs;
+  LiveRangeEdit LRE(nullptr, NewVRegs, nullptr, mf, lis, &vrm, nullptr);
+  HoistSpiller *HSpiller = (dyn_cast<InlineSpiller>(spiller))->getHSpiller();
+  HSpiller->hoistAllSpills(LRE);
+  assert(NewVRegs.size() == 0 &&
+         "No new vregs should be generated in hoistAllSpills");
+}
 }

//===----------------------------------------------------------------------===//
@@ -297,460 +325,11 @@
   }
 }

-
-//===----------------------------------------------------------------------===//
-//                            Sibling Values
-//===----------------------------------------------------------------------===//
-
-// After live range splitting, some values to be spilled may be defined by
-// copies from sibling registers. We trace the sibling copies back to the
-// original value if it still exists. We need it for rematerialization.
-// -// Even when the value can't be rematerialized, we still want to determine if -// the value has already been spilled, or we may want to hoist the spill from a -// loop. - bool InlineSpiller::isSibling(unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) && VRM.getOriginal(Reg) == Original; } -#ifndef NDEBUG -static raw_ostream &operator<<(raw_ostream &OS, - const InlineSpiller::SibValueInfo &SVI) { - OS << "spill " << PrintReg(SVI.SpillReg) << ':' - << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def; - if (SVI.SpillMBB) - OS << " in BB#" << SVI.SpillMBB->getNumber(); - if (SVI.AllDefsAreReloads) - OS << " all-reloads"; - if (SVI.DefByOrigPHI) - OS << " orig-phi"; - if (SVI.KillsSource) - OS << " kill"; - OS << " deps["; - for (VNInfo *Dep : SVI.Deps) - OS << ' ' << Dep->id << '@' << Dep->def; - OS << " ]"; - if (SVI.DefMI) - OS << " def: " << *SVI.DefMI; - else - OS << '\n'; - return OS; -} -#endif - -/// propagateSiblingValue - Propagate the value in SVI to dependents if it is -/// known. Otherwise remember the dependency for later. -/// -/// @param SVIIter SibValues entry to propagate. -/// @param VNI Dependent value, or NULL to propagate to all saved dependents. -void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter, - VNInfo *VNI) { - SibValueMap::value_type *SVI = &*SVIIter; - - // When VNI is non-NULL, add it to SVI's deps, and only propagate to that. - TinyPtrVector FirstDeps; - if (VNI) { - FirstDeps.push_back(VNI); - SVI->second.Deps.push_back(VNI); - } - - // Has the value been completely determined yet? If not, defer propagation. - if (!SVI->second.hasDef()) - return; - - // Work list of values to propagate. - SmallSetVector WorkList; - WorkList.insert(SVI); - - do { - SVI = WorkList.pop_back_val(); - TinyPtrVector *Deps = VNI ? &FirstDeps : &SVI->second.Deps; - VNI = nullptr; - - SibValueInfo &SV = SVI->second; - if (!SV.SpillMBB) - SV.SpillMBB = LIS.getMBBFromIndex(SV.SpillVNI->def); - - DEBUG(dbgs() << " prop to " << Deps->size() << ": " - << SVI->first->id << '@' << SVI->first->def << ":\t" << SV); - - assert(SV.hasDef() && "Propagating undefined value"); - - // Should this value be propagated as a preferred spill candidate? We don't - // propagate values of registers that are about to spill. - bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg); - unsigned SpillDepth = ~0u; - - for (VNInfo *Dep : *Deps) { - SibValueMap::iterator DepSVI = SibValues.find(Dep); - assert(DepSVI != SibValues.end() && "Dependent value not in SibValues"); - SibValueInfo &DepSV = DepSVI->second; - if (!DepSV.SpillMBB) - DepSV.SpillMBB = LIS.getMBBFromIndex(DepSV.SpillVNI->def); - - bool Changed = false; - - // Propagate defining instruction. - if (!DepSV.hasDef()) { - Changed = true; - DepSV.DefMI = SV.DefMI; - DepSV.DefByOrigPHI = SV.DefByOrigPHI; - } - - // Propagate AllDefsAreReloads. For PHI values, this computes an AND of - // all predecessors. - if (!SV.AllDefsAreReloads && DepSV.AllDefsAreReloads) { - Changed = true; - DepSV.AllDefsAreReloads = false; - } - - // Propagate best spill value. - if (PropSpill && SV.SpillVNI != DepSV.SpillVNI) { - if (SV.SpillMBB == DepSV.SpillMBB) { - // DepSV is in the same block. Hoist when dominated. - if (DepSV.KillsSource && SV.SpillVNI->def < DepSV.SpillVNI->def) { - // This is an alternative def earlier in the same MBB. - // Hoist the spill as far as possible in SpillMBB. 
This can ease - // register pressure: - // - // x = def - // y = use x - // s = copy x - // - // Hoisting the spill of s to immediately after the def removes the - // interference between x and y: - // - // x = def - // spill x - // y = use x - // - // This hoist only helps when the DepSV copy kills its source. - Changed = true; - DepSV.SpillReg = SV.SpillReg; - DepSV.SpillVNI = SV.SpillVNI; - DepSV.SpillMBB = SV.SpillMBB; - } - } else { - // DepSV is in a different block. - if (SpillDepth == ~0u) - SpillDepth = Loops.getLoopDepth(SV.SpillMBB); - - // Also hoist spills to blocks with smaller loop depth, but make sure - // that the new value dominates. Non-phi dependents are always - // dominated, phis need checking. - - const BranchProbability MarginProb(4, 5); // 80% - // Hoist a spill to outer loop if there are multiple dependents (it - // can be beneficial if more than one dependents are hoisted) or - // if DepSV (the hoisting source) is hotter than SV (the hoisting - // destination) (we add a 80% margin to bias a little towards - // loop depth). - bool HoistCondition = - (MBFI.getBlockFreq(DepSV.SpillMBB) >= - (MBFI.getBlockFreq(SV.SpillMBB) * MarginProb)) || - Deps->size() > 1; - - if ((Loops.getLoopDepth(DepSV.SpillMBB) > SpillDepth) && - HoistCondition && - (!DepSVI->first->isPHIDef() || - MDT.dominates(SV.SpillMBB, DepSV.SpillMBB))) { - Changed = true; - DepSV.SpillReg = SV.SpillReg; - DepSV.SpillVNI = SV.SpillVNI; - DepSV.SpillMBB = SV.SpillMBB; - } - } - } - - if (!Changed) - continue; - - // Something changed in DepSVI. Propagate to dependents. - WorkList.insert(&*DepSVI); - - DEBUG(dbgs() << " update " << DepSVI->first->id << '@' - << DepSVI->first->def << " to:\t" << DepSV); - } - } while (!WorkList.empty()); -} - -/// traceSiblingValue - Trace a value that is about to be spilled back to the -/// real defining instructions by looking through sibling copies. Always stay -/// within the range of OrigVNI so the registers are known to carry the same -/// value. -/// -/// Determine if the value is defined by all reloads, so spilling isn't -/// necessary - the value is already in the stack slot. -/// -/// Return a defining instruction that may be a candidate for rematerialization. -/// -MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, - VNInfo *OrigVNI) { - // Check if a cached value already exists. - SibValueMap::iterator SVI; - bool Inserted; - std::tie(SVI, Inserted) = - SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI))); - if (!Inserted) { - DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':' - << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second); - return SVI->second.DefMI; - } - - DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':' - << UseVNI->id << '@' << UseVNI->def << '\n'); - - // List of (Reg, VNI) that have been inserted into SibValues, but need to be - // processed. - SmallVector, 8> WorkList; - WorkList.push_back(std::make_pair(UseReg, UseVNI)); - - LiveInterval &OrigLI = LIS.getInterval(Original); - do { - unsigned Reg; - VNInfo *VNI; - std::tie(Reg, VNI) = WorkList.pop_back_val(); - DEBUG(dbgs() << " " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def - << ":\t"); - - // First check if this value has already been computed. - SVI = SibValues.find(VNI); - assert(SVI != SibValues.end() && "Missing SibValues entry"); - - // Trace through PHI-defs created by live range splitting. - if (VNI->isPHIDef()) { - // Stop at original PHIs. We don't know the value at the - // predecessors. 
Look up the VNInfo for the current definition - // in OrigLI, to properly determine whether or not this phi was - // added by splitting. - if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) { - DEBUG(dbgs() << "orig phi value\n"); - SVI->second.DefByOrigPHI = true; - SVI->second.AllDefsAreReloads = false; - propagateSiblingValue(SVI); - continue; - } - - // This is a PHI inserted by live range splitting. We could trace the - // live-out value from predecessor blocks, but that search can be very - // expensive if there are many predecessors and many more PHIs as - // generated by tail-dup when it sees an indirectbr. Instead, look at - // all the non-PHI defs that have the same value as OrigVNI. They must - // jointly dominate VNI->def. This is not optimal since VNI may actually - // be jointly dominated by a smaller subset of defs, so there is a change - // we will miss a AllDefsAreReloads optimization. - - // Separate all values dominated by OrigVNI into PHIs and non-PHIs. - SmallVector PHIs, NonPHIs; - LiveInterval &LI = LIS.getInterval(Reg); - - for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end(); - VI != VE; ++VI) { - VNInfo *VNI2 = *VI; - if (VNI2->isUnused()) - continue; - if (!OrigLI.containsOneValue() && - OrigLI.getVNInfoAt(VNI2->def) != OrigVNI) - continue; - if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def) - PHIs.push_back(VNI2); - else - NonPHIs.push_back(VNI2); - } - DEBUG(dbgs() << "split phi value, checking " << PHIs.size() - << " phi-defs, and " << NonPHIs.size() - << " non-phi/orig defs\n"); - - // Create entries for all the PHIs. Don't add them to the worklist, we - // are processing all of them in one go here. - for (VNInfo *PHI : PHIs) - SibValues.insert(std::make_pair(PHI, SibValueInfo(Reg, PHI))); - - // Add every PHI as a dependent of all the non-PHIs. - for (VNInfo *NonPHI : NonPHIs) { - // Known value? Try an insertion. - std::tie(SVI, Inserted) = - SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI))); - // Add all the PHIs as dependents of NonPHI. - SVI->second.Deps.insert(SVI->second.Deps.end(), PHIs.begin(), - PHIs.end()); - // This is the first time we see NonPHI, add it to the worklist. - if (Inserted) - WorkList.push_back(std::make_pair(Reg, NonPHI)); - else - // Propagate to all inserted PHIs, not just VNI. - propagateSiblingValue(SVI); - } - - // Next work list item. - continue; - } - - MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); - assert(MI && "Missing def"); - - // Trace through sibling copies. - if (unsigned SrcReg = isFullCopyOf(MI, Reg)) { - if (isSibling(SrcReg)) { - LiveInterval &SrcLI = LIS.getInterval(SrcReg); - LiveQueryResult SrcQ = SrcLI.Query(VNI->def); - assert(SrcQ.valueIn() && "Copy from non-existing value"); - // Check if this COPY kills its source. - SVI->second.KillsSource = SrcQ.isKill(); - VNInfo *SrcVNI = SrcQ.valueIn(); - DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':' - << SrcVNI->id << '@' << SrcVNI->def - << " kill=" << unsigned(SVI->second.KillsSource) << '\n'); - // Known sibling source value? Try an insertion. - std::tie(SVI, Inserted) = SibValues.insert( - std::make_pair(SrcVNI, SibValueInfo(SrcReg, SrcVNI))); - // This is the first time we see Src, add it to the worklist. - if (Inserted) - WorkList.push_back(std::make_pair(SrcReg, SrcVNI)); - propagateSiblingValue(SVI, VNI); - // Next work list item. - continue; - } - } - - // Track reachable reloads. 
- SVI->second.DefMI = MI; - SVI->second.SpillMBB = MI->getParent(); - int FI; - if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) { - DEBUG(dbgs() << "reload\n"); - propagateSiblingValue(SVI); - // Next work list item. - continue; - } - - // Potential remat candidate. - DEBUG(dbgs() << "def " << *MI); - SVI->second.AllDefsAreReloads = false; - propagateSiblingValue(SVI); - } while (!WorkList.empty()); - - // Look up the value we were looking for. We already did this lookup at the - // top of the function, but SibValues may have been invalidated. - SVI = SibValues.find(UseVNI); - assert(SVI != SibValues.end() && "Didn't compute requested info"); - DEBUG(dbgs() << " traced to:\t" << SVI->second); - return SVI->second.DefMI; -} - -/// analyzeSiblingValues - Trace values defined by sibling copies back to -/// something that isn't a sibling copy. -/// -/// Keep track of values that may be rematerializable. -void InlineSpiller::analyzeSiblingValues() { - SibValues.clear(); - - // No siblings at all? - if (Edit->getReg() == Original) - return; - - LiveInterval &OrigLI = LIS.getInterval(Original); - for (unsigned Reg : RegsToSpill) { - LiveInterval &LI = LIS.getInterval(Reg); - for (LiveInterval::const_vni_iterator VI = LI.vni_begin(), - VE = LI.vni_end(); VI != VE; ++VI) { - VNInfo *VNI = *VI; - if (VNI->isUnused()) - continue; - MachineInstr *DefMI = nullptr; - if (!VNI->isPHIDef()) { - DefMI = LIS.getInstructionFromIndex(VNI->def); - assert(DefMI && "No defining instruction"); - } - // Check possible sibling copies. - if (VNI->isPHIDef() || DefMI->isCopy()) { - VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def); - assert(OrigVNI && "Def outside original live range"); - if (OrigVNI->def != VNI->def) - DefMI = traceSiblingValue(Reg, VNI, OrigVNI); - } - if (DefMI && Edit->checkRematerializable(VNI, DefMI, AA)) { - DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@' - << VNI->def << " may remat from " << *DefMI); - } - } - } -} - -/// hoistSpill - Given a sibling copy that defines a value to be spilled, insert -/// a spill at a better location. -bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr &CopyMI) { - SlotIndex Idx = LIS.getInstructionIndex(CopyMI); - VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot()); - assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy"); - SibValueMap::iterator I = SibValues.find(VNI); - if (I == SibValues.end()) - return false; - - const SibValueInfo &SVI = I->second; - - // Let the normal folding code deal with the boring case. - if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI) - return false; - - // SpillReg may have been deleted by remat and DCE. - if (!LIS.hasInterval(SVI.SpillReg)) { - DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n'); - SibValues.erase(I); - return false; - } - - LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg); - if (!SibLI.containsValue(SVI.SpillVNI)) { - DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n'); - SibValues.erase(I); - return false; - } - - // Conservatively extend the stack slot range to the range of the original - // value. We may be able to do better with stack slot coloring by being more - // careful here. 
- assert(StackInt && "No stack slot assigned yet."); - LiveInterval &OrigLI = LIS.getInterval(Original); - VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx); - StackInt->MergeValueInAsValue(OrigLI, OrigVNI, StackInt->getValNumInfo(0)); - DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": " - << *StackInt << '\n'); - - // Already spilled everywhere. - if (SVI.AllDefsAreReloads) { - DEBUG(dbgs() << "\tno spill needed: " << SVI); - ++NumOmitReloadSpill; - return true; - } - // We are going to spill SVI.SpillVNI immediately after its def, so clear out - // any later spills of the same value. - eliminateRedundantSpills(SibLI, SVI.SpillVNI); - - MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def); - MachineBasicBlock::iterator MII; - if (SVI.SpillVNI->isPHIDef()) - MII = MBB->SkipPHIsAndLabels(MBB->begin()); - else { - MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def); - assert(DefMI && "Defining instruction disappeared"); - MII = DefMI; - ++MII; - } - // Insert spill without kill flag immediately after def. - TII.storeRegToStackSlot(*MBB, MII, SVI.SpillReg, false, StackSlot, - MRI.getRegClass(SVI.SpillReg), &TRI); - --MII; // Point to store instruction. - LIS.InsertMachineInstrInMaps(*MII); - DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII); - - ++NumSpills; - ++NumHoists; - return true; -} - /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any /// redundant spills of this value in SLI.reg and sibling copies. void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) { @@ -805,7 +384,8 @@ MI->setDesc(TII.get(TargetOpcode::KILL)); DeadDefs.push_back(MI); ++NumSpillsRemoved; - --NumSpills; + if (HSpiller && HSpiller->rmFromMergableSpills(MI, StackSlot)) + --NumSpills; } } } while (!WorkList.empty()); @@ -876,11 +456,11 @@ if (SnippetCopies.count(&MI)) return false; - // Use an OrigVNI from traceSiblingValue when ParentVNI is a sibling copy. - LiveRangeEdit::Remat RM(ParentVNI); - SibValueMap::const_iterator SibI = SibValues.find(ParentVNI); - if (SibI != SibValues.end()) - RM.OrigMI = SibI->second.DefMI; + LiveInterval &OrigLI = LIS.getInterval(Original); + VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); + LiveRangeEdit::Remat RM(ParentVNI, OrigVNI); + RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def); + if (!Edit->canRematerializeAt(RM, UseIdx, false)) { markValueUsed(&VirtReg, ParentVNI); DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI); @@ -931,7 +511,6 @@ /// reMaterializeAll - Try to rematerialize as many uses as possible, /// and trim the live ranges after. void InlineSpiller::reMaterializeAll() { - // analyzeSiblingValues has already tested all relevant defining instructions. 
   if (!Edit->anyRematerializable(AA))
     return;
@@ -1017,6 +596,9 @@
   if (InstrReg != Reg || FI != StackSlot)
     return false;

+  if (!IsLoad && HSpiller)
+    HSpiller->rmFromMergableSpills(MI, StackSlot);
+
   DEBUG(dbgs() << "Coalescing stack access: " << *MI);
   LIS.RemoveMachineInstrFromMaps(*MI);
   MI->eraseFromParent();
@@ -1141,6 +723,10 @@
     LIS.removePhysRegDefAt(Reg, Idx);
   }

+  int FI;
+  if (TII.isStoreToStackSlot(MI, FI) && HSpiller &&
+      HSpiller->rmFromMergableSpills(MI, FI))
+    --NumSpills;
   LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
   MI->eraseFromParent();
@@ -1166,9 +752,11 @@
   if (!WasCopy)
     ++NumFolded;
-  else if (Ops.front().second == 0)
+  else if (Ops.front().second == 0) {
     ++NumSpills;
-  else
+    if (HSpiller)
+      HSpiller->addToMergableSpills(FoldMI, StackSlot, Original);
+  } else
     ++NumReloads;
   return true;
 }
@@ -1203,6 +791,8 @@
   DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
                                            "spill"));
   ++NumSpills;
+  if (HSpiller)
+    HSpiller->addToMergableSpills(std::next(MI), StackSlot, Original);
 }

 /// spillAroundUses - insert spill code around each use of Reg.
@@ -1265,15 +855,7 @@
         SnippetCopies.insert(MI);
         continue;
       }
-      if (RI.Writes) {
-        // Hoist the spill of a sib-reg copy.
-        if (hoistSpill(OldLI, *MI)) {
-          // This COPY is now dead, the value is already in the stack slot.
-          MI->getOperand(0).setIsDead();
-          DeadDefs.push_back(MI);
-          continue;
-        }
-      } else {
+      if (!RI.Writes) {
         // This is a reload for a sib-reg copy. Drop spills downstream.
         LiveInterval &SibLI = LIS.getInterval(SibReg);
         eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx));
@@ -1380,7 +962,6 @@
   assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");

   collectRegsToSpill();
-  analyzeSiblingValues();
   reMaterializeAll();

   // Remat may handle everything.
@@ -1389,3 +970,332 @@

   Edit->calculateRegClassAndHint(MF, Loops, MBFI);
 }
+
+// When a spill is inserted, add the spill to the MergableSpills map.
+void HoistSpiller::addToMergableSpills(MachineInstr *Spill, int StackSlot,
+                                       unsigned Original) {
+  StackSlotToReg[StackSlot] = Original;
+  SlotIndex Idx = LIS.getInstructionIndex(*Spill);
+  VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+  std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+  MergableSpills[MIdx].insert(Spill);
+}
+
+// When a spill is removed, remove the spill from the MergableSpills map.
+// Return true if the spill was removed successfully.
+bool HoistSpiller::rmFromMergableSpills(MachineInstr *Spill, int StackSlot) {
+  int Original = StackSlotToReg[StackSlot];
+  if (!Original)
+    return false;
+  SlotIndex Idx = LIS.getInstructionIndex(*Spill);
+  VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+  std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+  return MergableSpills[MIdx].erase(Spill);
+}
+
+// Check BB to see if it is a possible target BB to place a hoisted spill,
+// i.e., there should be a live sibling of OrigReg at the insert point.
+bool HoistSpiller::isSpillCandBB(unsigned OrigReg, VNInfo *OrigVNI,
+                                 MachineBasicBlock *BB, unsigned &LiveReg) {
+  SlotIndex Idx;
+  MachineBasicBlock::iterator MI = BB->getFirstTerminator();
+  if (MI != BB->end())
+    Idx = LIS.getInstructionIndex(*MI);
+  else
+    Idx = LIS.getMBBEndIdx(BB).getPrevSlot();
+  DenseSet<unsigned> &Siblings = Virt2SiblingsMap[OrigReg];
+  assert((LIS.getInterval(OrigReg)).getVNInfoAt(Idx) == OrigVNI &&
+         "Unexpected VNI");
+
+  for (auto const ent : Siblings) {
+    LiveInterval &LI = LIS.getInterval(ent);
+    VNInfo *VNI = LI.getVNInfoAt(Idx);
+    if (VNI) {
+      LiveReg = ent;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Get the top-down order in which to visit the BB nodes containing spills.
+/// Redundant spills are found and put into SpillsToRm at the same time.
+void HoistSpiller::getVisitOrders(
+    MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineDomTreeNode *> &Orders,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKept,
+    DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+  // For each spill, check the BB the spill is located in and set
+  // SpillBBToSpill[]. If a BB contains more than one spill, only keep
+  // the spill with the smaller SlotIndex.
+  for (const auto CurrentSpill : Spills) {
+    MachineBasicBlock *Block = CurrentSpill->getParent();
+    MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+    MachineInstr *PrevSpill = SpillBBToSpill[Node];
+    if (PrevSpill) {
+      SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
+      SlotIndex CIdx = LIS.getInstructionIndex(*CurrentSpill);
+      MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+      MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+      SpillsToRm.push_back(SpillToRm);
+      SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
+    } else {
+      SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
+    }
+  }
+  for (const auto SpillToRm : SpillsToRm)
+    Spills.erase(SpillToRm);
+
+  SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+  SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+  MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+  // For every node in the dominator tree with a spill, walk up the
+  // dominator tree until reaching the Root node. If another node with a
+  // spill is found on the path, the original node is redundant and will
+  // be removed. All the nodes on the path from a node with a non-redundant
+  // spill to the Root node are added to the WorkSet, which is the set we
+  // look at when hoisting spills in the next step.
+  for (const auto Spill : Spills) {
+    MachineBasicBlock *Block = Spill->getParent();
+    MachineDomTreeNode *Node = MDT[Block];
+    MachineInstr *SpillToRm = nullptr;
+    while (Node != RootIDomNode) {
+      if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+        SpillToRm = SpillBBToSpill[MDT[Block]];
+        break;
+      } else if (WorkSet.count(Node)) {
+        break;
+      } else {
+        NodesOnPath.insert(Node);
+      }
+      Node = Node->getIDom();
+    }
+    if (SpillToRm) {
+      SpillsToRm.push_back(SpillToRm);
+    } else {
+      SpillsToKept[MDT[Block]] = 0;
+      WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+    }
+    NodesOnPath.clear();
+  }
+
+  // Sort the nodes in WorkSet in top-down order and save the nodes
+  // in Orders.
+  unsigned idx = 0;
+  Orders.push_back(MDT.DT->getNode(Root));
+  do {
+    MachineDomTreeNode *Node = Orders[idx++];
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      if (WorkSet.count(Child))
+        Orders.push_back(Child);
+    }
+  } while (idx != Orders.size());
+
+  DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+  {
+    SmallVectorImpl<MachineDomTreeNode *>::reverse_iterator RIt =
+        Orders.rbegin();
+    for (; RIt != Orders.rend(); RIt++)
+      DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+  }
+  DEBUG(dbgs() << "\n");
+}
+
+/// Try to hoist spills according to BB hotness. The spills to be removed
+/// are saved in SpillsToRm; the spills to be inserted are saved in
+/// SpillsToIns.
+void HoistSpiller::runHoistSpills(
+    unsigned OrigReg, VNInfo *OrigVNI, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+  // Visit order of dominator tree nodes.
+  SmallVector<MachineDomTreeNode *, 32> Orders;
+  // SpillsToKept contains all the nodes where spills are to be inserted
+  // during hoisting. If the spill to be inserted is an original spill
+  // (not a hoisted one), the value of the map entry is 0. If the spill
+  // is a hoisted spill, the value of the map entry is the VReg to be used
+  // as the RHS of the spill.
+  DenseMap<MachineDomTreeNode *, unsigned> SpillsToKept;
+  // Map from BB to the spill inside of it.
+  DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+  MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI->def);
+  getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKept,
+                 SpillBBToSpill);
+
+  // SpillsInSubTree keeps the map from a dom tree node to a set of nodes:
+  // it saves the locations where spills are to be inserted in the subtree
+  // of the node.
+  DenseMap<MachineDomTreeNode *, SmallPtrSet<MachineDomTreeNode *, 16>>
+      SpillsInSubTree;
+  // Iterate over Orders in reverse order, which is a bottom-up order in
+  // the dominator tree. Once we visit a dom tree node, we know that its
+  // children have already been visited and the spill locations in the
+  // subtrees of all the children have been determined.
+  SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+  for (; RIt != Orders.rend(); RIt++) {
+    MachineBasicBlock *Block = (*RIt)->getBlock();
+
+    // If Block contains an original spill, simply continue.
+    if (SpillsToKept.find(*RIt) != SpillsToKept.end() && !SpillsToKept[*RIt]) {
+      SpillsInSubTree[*RIt].insert(*RIt);
+      continue;
+    }
+
+    // Collect spills in the subtree of the current node (*RIt) into
+    // SpillsInSubTree[*RIt].
+    const std::vector<MachineDomTreeNode *> &Children = (*RIt)->getChildren();
+    unsigned NumChildren = Children.size();
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      SpillsInSubTree[*RIt].insert(SpillsInSubTree[Child].begin(),
+                                   SpillsInSubTree[Child].end());
+      SpillsInSubTree.erase(Child);
+    }
+
+    // No spills in subtree, simply continue.
+    if (SpillsInSubTree[*RIt].empty())
+      continue;
+
+    // Check whether Block is a possible candidate for inserting a spill.
+    unsigned LiveReg = 0;
+    if (!isSpillCandBB(OrigReg, OrigVNI, Block, LiveReg))
+      continue;
+
+    // Now Block is a proper target BB for hoisting spills. Decide whether
+    // to hoist the spills to the current node: get the existing cost of
+    // all the spills in SpillsInSubTree[*RIt].
+    BlockFrequency SpillCost = 0;
+    for (const auto SpillBB : SpillsInSubTree[*RIt])
+      SpillCost += MBFI.getBlockFreq(SpillBB->getBlock());
+
+    // If there are multiple spills that could be merged, bias a little
+    // towards hoisting the spill.
+    BranchProbability MarginProb = (SpillsInSubTree[*RIt].size() > 1)
+                                       ? BranchProbability(9, 10)
+                                       : BranchProbability(1, 1);
+    if (SpillCost > MBFI.getBlockFreq(Block) * MarginProb) {
+      // Hoist: Move spills to the current Block.
+      for (const auto SpillBB : SpillsInSubTree[*RIt]) {
+        // When SpillBB is a BB containing an original spill, insert the
+        // spill into SpillsToRm.
+        if (SpillsToKept.find(SpillBB) != SpillsToKept.end() &&
+            !SpillsToKept[SpillBB]) {
+          MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+          SpillsToRm.push_back(SpillToRm);
+        }
+        // SpillBB will not contain a spill anymore, so remove it from
+        // SpillsToKept.
+        SpillsToKept.erase(SpillBB);
+      }
+      // The current Block is the BB containing the new hoisted spill. Add
+      // it to SpillsToKept. LiveReg is the RHS of the spill.
+      SpillsToKept[*RIt] = LiveReg;
+      DEBUG({
+        dbgs() << "spills in BB: ";
+        for (const auto Rspill : SpillsInSubTree[*RIt])
+          dbgs() << Rspill->getBlock()->getNumber() << " ";
+        dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+               << "\n";
+      });
+      SpillsInSubTree[*RIt].clear();
+      SpillsInSubTree[*RIt].insert(*RIt);
+    }
+  }
+  // For spills in SpillsToKept with LiveReg set (i.e., not an original
+  // spill), save them to SpillsToIns.
+  for (const auto ent : SpillsToKept) {
+    if (ent.second)
+      SpillsToIns[ent.first->getBlock()] = ent.second;
+  }
+}
+
+/// For spills with equal values, remove redundant spills and hoist the
+/// remaining ones to a less hot spot.
+void HoistSpiller::hoistAllSpills(LiveRangeEdit &Edit) {
+  // Save the mapping between stackslot and its original reg.
+  DenseMap<int, unsigned> SlotToOrigReg;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    int Slot = VRM.getStackSlot(Reg);
+    if (Slot != VirtRegMap::NO_STACK_SLOT) {
+      for (const auto &ent : MergableSpills) {
+        if (ent.first.first == Slot &&
+            SlotToOrigReg.find(Slot) == SlotToOrigReg.end())
+          SlotToOrigReg[Slot] = VRM.getOriginal(Reg);
+      }
+    }
+    unsigned Original = VRM.getPreSplitReg(Reg);
+    if (!MRI.def_empty(Reg))
+      Virt2SiblingsMap[Original].insert(Reg);
+  }
+
+  // Each entry in MergableSpills contains a spill set with equal values.
+  for (auto &ent : MergableSpills) {
+    int Slot = ent.first.first;
+    unsigned OrigReg = SlotToOrigReg[Slot];
+    VNInfo *OrigVNI = ent.first.second;
+    SmallPtrSet<MachineInstr *, 16> &EqValSpills = ent.second;
+    if (!ent.second.size())
+      continue;
+
+    DEBUG({
+      dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+             << "Equal spills in BB: ";
+      for (const auto spill : EqValSpills)
+        dbgs() << spill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // SpillsToRm is the spill set to be removed from EqValSpills.
+    SmallVector<MachineInstr *, 16> SpillsToRm;
+    // SpillsToIns is the spill set to be newly inserted after hoisting.
+    DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+    runHoistSpills(OrigReg, OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+    DEBUG({
+      dbgs() << "Finally inserted spills in BB: ";
+      for (const auto Ispill : SpillsToIns)
+        dbgs() << Ispill.first->getNumber() << " ";
+      dbgs() << "\nFinally removed spills in BB: ";
+      for (const auto Rspill : SpillsToRm)
+        dbgs() << Rspill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // Stack live range update.
+    LiveInterval &StackIntvl = LSS.getInterval(Slot);
+    if (!SpillsToIns.empty() || !SpillsToRm.empty()) {
+      LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+      StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+                                     StackIntvl.getValNumInfo(0));
+    }
+
+    // Insert hoisted spills.
+    for (auto const ent : SpillsToIns) {
+      MachineBasicBlock *BB = ent.first;
+      unsigned LiveReg = ent.second;
+      MachineBasicBlock::iterator MI = BB->getFirstTerminator();
+      TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
+                              MRI.getRegClass(LiveReg), &TRI);
+      LIS.InsertMachineInstrRangeInMaps(std::prev(MI), MI);
+      ++NumSpills;
+    }
+
+    // Remove redundant spills or change them to dead instructions.
+    NumSpills -= SpillsToRm.size();
+    for (auto const ent : SpillsToRm) {
+      ent->setDesc(TII.get(TargetOpcode::KILL));
+      for (unsigned i = ent->getNumOperands(); i; --i) {
+        MachineOperand &MO = ent->getOperand(i - 1);
+        if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+          ent->RemoveOperand(i - 1);
+      }
+    }
+    Edit.eliminateDeadDefs(SpillsToRm, None, true);
+  }
+}
Index: lib/CodeGen/LiveRangeEdit.cpp
===================================================================
--- lib/CodeGen/LiveRangeEdit.cpp
+++ lib/CodeGen/LiveRangeEdit.cpp
@@ -63,10 +63,13 @@
   for (VNInfo *VNI : getParent().valnos) {
     if (VNI->isUnused())
       continue;
-    MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
+    unsigned Original = VRM->getOriginal(getReg());
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
     if (!DefMI)
       continue;
-    checkRematerializable(VNI, DefMI, aa);
+    checkRematerializable(OrigVNI, DefMI, aa);
   }
   ScannedRemattable = true;
 }
@@ -119,18 +122,13 @@
   assert(ScannedRemattable && "Call anyRematerializable first");

   // Use scanRemattable info.
-  if (!Remattable.count(RM.ParentVNI))
+  if (!Remattable.count(RM.OrigVNI))
     return false;

   // No defining instruction provided.
   SlotIndex DefIdx;
-  if (RM.OrigMI)
-    DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
-  else {
-    DefIdx = RM.ParentVNI->def;
-    RM.OrigMI = LIS.getInstructionFromIndex(DefIdx);
-    assert(RM.OrigMI && "No defining instruction for remattable value");
-  }
+  assert(RM.OrigMI && "No defining instruction for remattable value");
+  DefIdx = LIS.getInstructionIndex(*(RM.OrigMI));

   // If only cheap remats were requested, bail out early.
   if (cheapAsAMove && !TII.isAsCheapAsAMove(RM.OrigMI))
@@ -261,6 +259,15 @@
   // Collect virtual registers to be erased after MI is gone.
   SmallVector<unsigned, 8> RegsToErase;
   bool ReadsPhysRegs = false;
+  bool isOrigDef = false;
+  unsigned Dest;
+  if (VRM && MI->getOperand(0).isReg()) {
+    Dest = MI->getOperand(0).getReg();
+    unsigned Original = VRM->getOriginal(Dest);
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+    isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+  }

   // Check for live intervals that may shrink
   for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
@@ -314,11 +321,24 @@
     }
     DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
   } else {
-    if (TheDelegate)
-      TheDelegate->LRE_WillEraseInstruction(MI);
-    LIS.RemoveMachineInstrFromMaps(*MI);
-    MI->eraseFromParent();
-    ++NumDCEDeleted;
+    // If the dest of MI is an original reg, don't delete the inst. Replace
+    // the dest with a new reg, and keep the inst for remat of other siblings.
+    // The inst is saved in LiveRangeEdit::DeadRemats and will be deleted
+    // after all the allocations of the func are done.
+    if (isOrigDef) {
+      unsigned NewDest = createFrom(Dest);
+      pop_back();
+      markDeadRemat(MI);
+      const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+      MI->substituteRegister(Dest, NewDest, 0, TRI);
+      MI->getOperand(0).setIsDead(false);
+    } else {
+      if (TheDelegate)
+        TheDelegate->LRE_WillEraseInstruction(MI);
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+      ++NumDCEDeleted;
+    }
   }

   // Erase any virtregs that are now empty and unused. There may be
@@ -332,8 +352,9 @@
   }
 }

-void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
-                                      ArrayRef<unsigned> RegsBeingSpilled) {
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                                      ArrayRef<unsigned> RegsBeingSpilled,
+                                      bool NoSplit) {
   ToShrinkSet ToShrink;

   for (;;) {
@@ -355,6 +376,9 @@
     if (!LIS.shrinkToUses(LI, &Dead))
       continue;

+    if (NoSplit)
+      continue;
+
     // Don't create new intervals for a register being spilled.
     // The new intervals would have to be spilled anyway so its not worth it.
     // Also they currently aren't spilled so creating them and not spilling
Index: lib/CodeGen/RegAllocBase.h
===================================================================
--- lib/CodeGen/RegAllocBase.h
+++ lib/CodeGen/RegAllocBase.h
@@ -65,6 +65,12 @@
   LiveRegMatrix *Matrix;
   RegisterClassInfo RegClassInfo;

+  /// Any inst that defines an original reg but whose defs are all dead after
+  /// remat is saved in DeadRemats. The deletion of such insts is postponed
+  /// till all the allocations are done, so their remat exprs are always
+  /// available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
   RegAllocBase()
     : TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {}
@@ -77,6 +83,9 @@
   // physical register assignments.
   void allocatePhysRegs();

+  // Remove dead defs because of rematerialization.
+  void eliminateDeadRemats();
+
   // Get a temporary reference to a Spiller instance.
   virtual Spiller &spiller() = 0;
Index: lib/CodeGen/RegAllocBase.cpp
===================================================================
--- lib/CodeGen/RegAllocBase.cpp
+++ lib/CodeGen/RegAllocBase.cpp
@@ -153,3 +153,11 @@
     }
   }
 }
+
+void RegAllocBase::eliminateDeadRemats() {
+  for (auto ent : DeadRemats) {
+    LIS->RemoveMachineInstrFromMaps(*ent);
+    ent->eraseFromParent();
+  }
+  DeadRemats.clear();
+}
Index: lib/CodeGen/RegAllocBasic.cpp
===================================================================
--- lib/CodeGen/RegAllocBasic.cpp
+++ lib/CodeGen/RegAllocBasic.cpp
@@ -199,7 +199,7 @@
     Matrix->unassign(Spill);

     // Spill the extracted interval.
-    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+    LiveRangeEdit LRE(&Spill, SplitVRegs, &DeadRemats, *MF, *LIS, VRM);
     spiller().spill(LRE);
   }
   return true;
@@ -258,7 +258,7 @@
   DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
   if (!VirtReg.isSpillable())
     return ~0u;
-  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
+  LiveRangeEdit LRE(&VirtReg, SplitVRegs, &DeadRemats, *MF, *LIS, VRM);
   spiller().spill(LRE);

   // The live virtual register requesting allocation was spilled, so tell
@@ -283,6 +283,7 @@
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));

   allocatePhysRegs();
+  eliminateDeadRemats();

   // Diagnostic output before rewriting
   DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
Index: lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- lib/CodeGen/RegAllocGreedy.cpp
+++ lib/CodeGen/RegAllocGreedy.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//

-#include "llvm/CodeGen/Passes.h"
 #include "AllocationOrder.h"
 #include "InterferenceCache.h"
 #include "LiveDebugVariables.h"
@@ -33,6 +32,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -44,6 +44,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <queue>

@@ -55,14 +56,14 @@
 STATISTIC(NumLocalSplits,  "Number of split local live ranges");
 STATISTIC(NumEvicted,      "Number of interferences evicted");

-static cl::opt<SplitEditor::ComplementSpillMode>
-SplitSpillMode("split-spill-mode", cl::Hidden,
-  cl::desc("Spill mode for splitting live ranges"),
-  cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
-             clEnumValN(SplitEditor::SM_Size,  "size",  "Optimize for size"),
-             clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
-             clEnumValEnd),
-  cl::init(SplitEditor::SM_Partition));
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+    "split-spill-mode", cl::Hidden,
+    cl::desc("Spill mode for splitting live ranges"),
+    cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+               clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+               clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+               clEnumValEnd),
+    cl::init(SplitEditor::SM_Speed));

 static cl::opt<unsigned>
 LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
@@ -397,6 +398,7 @@
                                      SmallVirtRegSet &, unsigned);
   void tryHintRecoloring(LiveInterval &);
   void tryHintsRecoloring();
+  void postOptimization();

   /// Model the information carried by one end of a copy.
   struct HintInfo {
@@ -1465,7 +1467,7 @@
                                     SmallVectorImpl<unsigned> &NewVRegs) {
   SmallVector<unsigned, 8> UsedCands;
   // Prepare split editor.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitSpillMode);

   // Assign all edge bundles to the preferred candidate, or NoCand.
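
The hunks above and below mechanically thread one allocator-owned DeadRemats set into every LiveRangeEdit the allocators create. A minimal sketch of that lifecycle, under simplified stand-in types (everything named *Sketch is hypothetical and invented for illustration; the real types are MachineInstr, LiveRangeEdit, and RegAllocBase):

#include "llvm/ADT/SmallPtrSet.h"

struct MachineInstrSketch {}; // stand-in for MachineInstr

// Stand-in for LiveRangeEdit: instead of erasing a dead remat source, it
// parks the instruction in the shared set so sibling registers can still
// rematerialize from it later during allocation.
struct LiveRangeEditSketch {
  llvm::SmallPtrSet<MachineInstrSketch *, 32> *DeadRemats = nullptr;

  void markDeadRemat(MachineInstrSketch *MI) {
    if (DeadRemats) // null for allocators that do not defer the deletion
      DeadRemats->insert(MI);
  }
};

// Stand-in for RegAllocBase: the parked instructions are destroyed only
// after allocatePhysRegs() has finished, mirroring eliminateDeadRemats().
struct RegAllocSketch {
  llvm::SmallPtrSet<MachineInstrSketch *, 32> DeadRemats;

  void eliminateDeadRemats() {
    for (MachineInstrSketch *MI : DeadRemats)
      delete MI; // the real code unmaps the instruction and erases it
    DeadRemats.clear();
  }
};

The design point is ownership: because the set lives in the allocator base class, every LiveRangeEdit created during selectOrSplit shares it, and each allocator calls its eliminateDeadRemats variant exactly once, after allocation is complete.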
@@ -1513,7 +1515,7 @@
   assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
   unsigned Reg = VirtReg.reg;
   bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitSpillMode);

   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -1585,7 +1587,7 @@

   // Always enable split spill mode, since we're effectively spilling to a
   // register.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitEditor::SM_Size);

   ArrayRef<SlotIndex> Uses = SA->getUseSlots();
@@ -1908,7 +1910,7 @@
                << '-' << Uses[BestAfter] << ", " << BestDiff << ", "
                << (BestAfter - BestBefore + 1) << " instrs\n");

-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit);

   SE->openIntv();
@@ -2551,7 +2553,7 @@
     NewVRegs.push_back(VirtReg.reg);
   } else {
     NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
-    LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+    LiveRangeEdit LRE(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
     spiller().spill(LRE);
     setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
@@ -2564,6 +2566,11 @@
   return 0;
 }

+void RAGreedy::postOptimization() {
+  eliminateDeadRemats();
+  startHoistSpiller(*MF, *VRM, *LIS, &spiller());
+}
+
 bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
                << "********** Function: " << mf.getName() << '\n');
@@ -2587,6 +2594,7 @@
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   DomTree = &getAnalysis<MachineDominatorTree>();
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+  createHoistSpiller(*this, *MF, *VRM, &spiller());
   Loops = &getAnalysis<MachineLoopInfo>();
   Bundles = &getAnalysis<EdgeBundles>();
   SpillPlacer = &getAnalysis<SpillPlacement>();
@@ -2609,6 +2617,8 @@
   allocatePhysRegs();
   tryHintsRecoloring();

+  postOptimization();
+
   releaseMemory();
   return true;
 }
Index: lib/CodeGen/RegAllocPBQP.cpp
===================================================================
--- lib/CodeGen/RegAllocPBQP.cpp
+++ lib/CodeGen/RegAllocPBQP.cpp
@@ -123,6 +123,12 @@

   RegSet VRegsToAlloc, EmptyIntervalVRegs;

+  /// Any inst that defines an original reg but whose defs are all dead after
+  /// remat is saved in DeadRemats. The deletion of such insts is postponed
+  /// till all the allocations are done, so their remat exprs are always
+  /// available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
   /// \brief Finds the initial set of vreg intervals to allocate.
   void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
@@ -146,6 +152,8 @@
   void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
                      VirtRegMap &VRM) const;

+  /// Remove dead defs because of rematerialization.
+  void eliminateDeadRemats(LiveIntervals &LIS);
 };

 char RegAllocPBQP::ID = 0;
@@ -631,7 +639,8 @@
                                  VirtRegMap &VRM, Spiller &VRegSpiller) {
   VRegsToAlloc.erase(VReg);
-  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, &DeadRemats, MF, LIS,
+                    &VRM);
   VRegSpiller.spill(LRE);

   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -713,6 +722,14 @@
   }
 }

+void RegAllocPBQP::eliminateDeadRemats(LiveIntervals &LIS) {
+  for (auto ent : DeadRemats) {
+    LIS.RemoveMachineInstrFromMaps(*ent);
+    ent->eraseFromParent();
+  }
+  DeadRemats.clear();
+}
+
 static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
                                              unsigned NumInstr) {
   // All intervals have a spill weight that is mostly proportional to the number
@@ -798,6 +815,7 @@

   // Finalise allocation, allocate empty ranges.
   finalizeAlloc(MF, LIS, VRM);
+  eliminateDeadRemats(LIS);
   VRegsToAlloc.clear();
   EmptyIntervalVRegs.clear();
Index: lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- lib/CodeGen/RegisterCoalescer.cpp
+++ lib/CodeGen/RegisterCoalescer.cpp
@@ -459,8 +459,8 @@

 void RegisterCoalescer::eliminateDeadDefs() {
   SmallVector<unsigned, 8> NewRegs;
-  LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
-                nullptr, this).eliminateDeadDefs(DeadDefs);
+  LiveRangeEdit(nullptr, NewRegs, nullptr, *MF, *LIS, nullptr, this)
+      .eliminateDeadDefs(DeadDefs);
 }

 void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
Index: lib/CodeGen/Spiller.h
===================================================================
--- lib/CodeGen/Spiller.h
+++ lib/CodeGen/Spiller.h
@@ -16,6 +16,7 @@
   class MachineFunction;
   class MachineFunctionPass;
   class VirtRegMap;
+  class LiveIntervals;

   /// Spiller interface.
   ///
@@ -28,7 +29,6 @@

     /// spill - Spill the LRE.getParent() live interval.
     virtual void spill(LiveRangeEdit &LRE) = 0;
-
   };

   /// Create and return a spiller that will insert spill code directly instead
@@ -37,6 +37,13 @@
                                 MachineFunction &mf,
                                 VirtRegMap &vrm);

+  void createHoistSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                          VirtRegMap &vrm, Spiller *);
+
+  /// startHoistSpiller - Run the HoistSpiller attached to the given spiller
+  /// and start hoisting spills.
+  void startHoistSpiller(MachineFunction &mf, VirtRegMap &vrm,
+                         LiveIntervals &lis, Spiller *);
 }

 #endif
Index: lib/CodeGen/SplitKit.h
===================================================================
--- lib/CodeGen/SplitKit.h
+++ lib/CodeGen/SplitKit.h
@@ -18,6 +18,7 @@
 #include "LiveRangeCalc.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -329,9 +330,13 @@
   MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
                                           MachineBasicBlock *DefMBB);

-  /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
-  /// way that minimizes code size. This implements the SM_Size spill mode.
-  void hoistCopiesForSize();
+  /// removeRedundentCopies - Remove redundant back-copies once it has been
+  /// decided that those back-copies will not be hoisted.
+  void removeRedundentCopies(DenseSet<unsigned> &NotToHoistSet,
+                             SmallVectorImpl<VNInfo *> &BackCopies);
+
+  /// hoistCopies - Hoist back-copies to the complement interval.
+  void hoistCopies();

   /// transferValues - Transfer values to the new ranges.
   /// Return true if any ranges were skipped.
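
Before the SplitKit.cpp hunks below, a sketch of the SM_Speed decision they implement: the summed block frequency of a parent value's back-copies is compared against the frequency of their nearest common dominator, and hoisting is skipped when the dominator is hotter. shouldHoistBackCopies() is a hypothetical distillation for illustration only, not part of the SplitEditor API:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/BlockFrequency.h"
using namespace llvm;

// Hoisting replaces all back-copies of one parent value with a single copy
// in their nearest common dominator. That pays off only when the dominator
// runs less often than the copies it replaces, which is the
// "MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]" test in the
// hoistCopies() hunk below.
bool shouldHoistBackCopies(BlockFrequency DomFreq,
                           ArrayRef<BlockFrequency> BackCopyFreqs) {
  BlockFrequency Total(0);
  for (BlockFrequency F : BackCopyFreqs)
    Total += F;
  return DomFreq <= Total;
}

When the test fails, the patch does not give up entirely: NotToHoistSet records the value, and removeRedundentCopies() still deletes any back-copy that is dominated by another back-copy of equal value.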
Index: lib/CodeGen/SplitKit.cpp
===================================================================
--- lib/CodeGen/SplitKit.cpp
+++ lib/CodeGen/SplitKit.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -430,7 +431,12 @@
   bool Late = RegIdx != 0;

   // Attempt cheap-as-a-copy rematerialization.
-  LiveRangeEdit::Remat RM(ParentVNI);
+  unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+  LiveRangeEdit::Remat RM(ParentVNI, OrigVNI);
+  RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
   if (Edit->canRematerializeAt(RM, UseIdx, true)) {
     Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
     ++NumRemats;
@@ -716,7 +722,62 @@
   }
 }

-void SplitEditor::hoistCopiesForSize() {
+/// Remove redundant back-copies when the back-copies for the same ParentVNI
+/// cannot be hoisted because hoisting would cost too much.
+void SplitEditor::removeRedundentCopies(DenseSet<unsigned> &NotToHoistSet,
+                                        SmallVectorImpl<VNInfo *> &BackCopies) {
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+  LiveInterval *Parent = &Edit->getParent();
+  SmallVector<SmallPtrSet<VNInfo *, 4>, 8> EqualVNs(Parent->getNumValNums());
+  SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+  // Aggregate VNIs having the same value as ParentVNI.
+  for (VNInfo *VNI : LI->valnos) {
+    if (VNI->isUnused())
+      continue;
+    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+    EqualVNs[ParentVNI->id].insert(VNI);
+  }
+
+  // For the VNI aggregation of each ParentVNI, collect the dominated, i.e.,
+  // redundant VNIs into BackCopies.
+  for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+    VNInfo *ParentVNI = Parent->getValNumInfo(i);
+    if (!NotToHoistSet.count(ParentVNI->id))
+      continue;
+    for (auto Ent1 : EqualVNs[ParentVNI->id]) {
+      for (auto Ent2 : EqualVNs[ParentVNI->id]) {
+        if (Ent1 == Ent2 || DominatedVNIs.count(Ent1) ||
+            DominatedVNIs.count(Ent2))
+          continue;
+
+        MachineBasicBlock *MBB1 = LIS.getMBBFromIndex(Ent1->def);
+        MachineBasicBlock *MBB2 = LIS.getMBBFromIndex(Ent2->def);
+        if (MBB1 == MBB2) {
+          DominatedVNIs.insert(Ent1->def < Ent2->def ? Ent2 : Ent1);
+        } else if (MDT.dominates(MBB1, MBB2)) {
+          DominatedVNIs.insert(Ent2);
+        } else if (MDT.dominates(MBB2, MBB1)) {
+          DominatedVNIs.insert(Ent1);
+        }
+      }
+    }
+    if (!DominatedVNIs.empty()) {
+      forceRecompute(0, ParentVNI);
+      for (auto Ent : DominatedVNIs) {
+        BackCopies.push_back(Ent);
+      }
+      DominatedVNIs.clear();
+    }
+  }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies of
+/// the same ParentVNI and hoist the back-copies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and it is not
+/// beneficial to do the hoisting, simply remove the dominated back-copies
+/// of the same ParentVNI.
+void SplitEditor::hoistCopies() {
   // Get the complement interval, always RegIdx 0.
   LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   LiveInterval *Parent = &Edit->getParent();
@@ -725,6 +786,11 @@
   // indexed by ParentVNI->id.
   typedef std::pair<MachineBasicBlock *, SlotIndex> DomPair;
   SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+  // The total cost of all the back-copies for each ParentVNI.
+  SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+  // The set of ParentVNI->ids for which hoisting back-copies is not
+  // beneficial for speed.
+ DenseSet NotToHoistSet; // Find the nearest common dominator for parent values with multiple // back-copies. If a single back-copy dominates, put it in DomPair.second. @@ -740,6 +806,7 @@ continue; MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def); + DomPair &Dom = NearestDom[ParentVNI->id]; // Keep directly defined parent values. This is either a PHI or an @@ -774,6 +841,7 @@ else if (Near != Dom.first) // None dominate. Hoist to common dominator, need new def. Dom = DomPair(Near, SlotIndex()); + Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB); } DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def @@ -792,6 +860,11 @@ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def); // Get a less loopy dominator than Dom.first. Dom.first = findShallowDominator(Dom.first, DefMBB); + if (SpillMode == SM_Speed && + MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) { + NotToHoistSet.insert(ParentVNI->id); + continue; + } SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot(); Dom.second = defFromParent(0, ParentVNI, Last, *Dom.first, @@ -806,11 +879,18 @@ continue; VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def); const DomPair &Dom = NearestDom[ParentVNI->id]; - if (!Dom.first || Dom.second == VNI->def) + if (!Dom.first || Dom.second == VNI->def || + NotToHoistSet.count(ParentVNI->id)) continue; BackCopies.push_back(VNI); forceRecompute(0, ParentVNI); } + + // If it is not beneficial to hoist all the BackCopies, simply remove + // redundant BackCopies in speed mode. + if (SpillMode == SM_Speed && !NotToHoistSet.empty()) + removeRedundantCopies(NotToHoistSet, BackCopies); + removeBackCopies(BackCopies); } @@ -1004,6 +1084,8 @@ // Dead defs end at the dead slot. if (S.end != S.valno->def.getDeadSlot()) continue; + if (S.valno->isPHIDef()) + continue; MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def); assert(MI && "Missing instruction for dead def"); MI->addRegisterDead(LI->reg, &TRI); @@ -1048,10 +1130,8 @@ // Leave all back-copies as is. break; case SM_Size: - hoistCopiesForSize(); - break; case SM_Speed: - llvm_unreachable("Spill mode 'speed' not implemented yet"); + hoistCopies(); } // Transfer the simply mapped values, check if any are skipped. Index: test/CodeGen/AArch64/aarch64-deferred-spilling.ll =================================================================== --- test/CodeGen/AArch64/aarch64-deferred-spilling.ll +++ test/CodeGen/AArch64/aarch64-deferred-spilling.ll @@ -1,514 +0,0 @@ -;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED -;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR - -; Check that we do not end up with useless spill code. -; -; Move to the basic block we are interested in. -; -; CHECK: // %if.then.120 -; -; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill -; Check that w21 wouldn't need to be spilled since it is never reused. -; REGULAR-NOT: {{[wx]}}21{{,?}} -; -; Check that w22 is used to carry a value through the call.
-; DEFERRED-NOT: str {{[wx]}}22, -; DEFERRED: mov {{[wx]}}22, -; DEFERRED-NOT: str {{[wx]}}22, -; -; CHECK: bl fprintf -; -; DEFERRED-NOT: ldr {{[wx]}}22, -; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22 -; DEFERRED-NOT: ldr {{[wx]}}22, -; -; REGULAR-NOT: {{[wx]}}21{{,?}} -; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload -; -; End of the basic block we are interested in. -; CHECK: b -; CHECK: {{[^:]+}}: // %sw.bb.123 - -%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } -%struct.__sbuf = type { i8*, i64 } -%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* } -%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* } - -@__sF = external global [0 x %struct.__sFILE], align 8 -@.str = private unnamed_addr constant [20 x i8] c"\0A [%d: stuff+mf \00", align 1 - -declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...) - -declare void @bar(i32) - -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) - -define i32 @foo(%struct.DState* %s) { -entry: - %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1 - %tmp = load i32, i32* %state, align 4 - %cmp = icmp eq i32 %tmp, 10 - %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40 - br i1 %cmp, label %if.end.thread, label %if.end - -if.end.thread: ; preds = %entry - %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 - %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 - %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 - %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 - %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 - %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 - %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 - %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 - %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 - %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 - %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 - %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 - %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 - %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 - %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 - %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 - %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 - %save_zj = getelementptr inbounds 
%struct.DState, %struct.DState* %s, i64 0, i32 58 - %tmp1 = bitcast i32* %save_i to i8* - call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false) - br label %sw.default - -if.end: ; preds = %entry - %.pre = load i32, i32* %save_i, align 4 - %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 - %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4 - %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 - %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4 - %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 - %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4 - %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 - %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4 - %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 - %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4 - %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 - %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4 - %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 - %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4 - %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 - %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4 - %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 - %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4 - %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 - %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4 - %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 - %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4 - %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 - %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4 - %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 - %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4 - %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 - %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4 - %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 - %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4 - %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 - %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4 - %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 - %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4 - %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58 - %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4 - switch i32 %tmp, label %sw.default [ - i32 13, label %sw.bb - i32 14, label %if.end.sw.bb.65_crit_edge - i32 25, label %if.end.sw.bb.123_crit_edge - ] 
- -if.end.sw.bb.123_crit_edge: ; preds = %if.end - %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 - br label %sw.bb.123 - -if.end.sw.bb.65_crit_edge: ; preds = %if.end - %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 - %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4 - br label %sw.bb.65 - -sw.bb: ; preds = %if.end - %sunkaddr = ptrtoint %struct.DState* %s to i64 - %sunkaddr485 = add i64 %sunkaddr, 8 - %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32* - store i32 13, i32* %sunkaddr486, align 4 - %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 - %tmp2 = load i32, i32* %bsLive, align 4 - %cmp28.400 = icmp sgt i32 %tmp2, 7 - br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph - -sw.bb.if.then.29_crit_edge: ; preds = %sw.bb - %sunkaddr487 = ptrtoint %struct.DState* %s to i64 - %sunkaddr488 = add i64 %sunkaddr487, 32 - %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32* - %.pre425 = load i32, i32* %sunkaddr489, align 4 - br label %if.then.29 - -if.end.33.lr.ph: ; preds = %sw.bb - %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream** - %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8 - %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1 - %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4 - %tmp4 = add i32 %.pre430, -1 - br label %if.end.33 - -if.then.29: ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge - %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ] - %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ] - %sub = add nsw i32 %.lcssa393, -8 - %shr = lshr i32 %tmp5, %sub - %and = and i32 %shr, 255 - %sunkaddr491 = ptrtoint %struct.DState* %s to i64 - %sunkaddr492 = add i64 %sunkaddr491, 36 - %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32* - store i32 %sub, i32* %sunkaddr493, align 4 - %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9 - store i32 %and, i32* %blockSize100k, align 4 - %and.off = add nsw i32 %and, -49 - %tmp6 = icmp ugt i32 %and.off, 8 - br i1 %tmp6, label %save_state_and_return, label %if.end.62 - -if.end.33: ; preds = %while.body.backedge, %if.end.33.lr.ph - %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ] - %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ] - %cmp35 = icmp eq i32 %lsr.iv482, -1 - br i1 %cmp35, label %save_state_and_return, label %if.end.37 - -if.end.37: ; preds = %if.end.33 - %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8** - %sunkaddr494 = ptrtoint %struct.DState* %s to i64 - %sunkaddr495 = add i64 %sunkaddr494, 32 - %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32* - %tmp9 = load i32, i32* %sunkaddr496, align 4 - %shl = shl i32 %tmp9, 8 - %tmp10 = load i8*, i8** %tmp8, align 8 - %tmp11 = load i8, i8* %tmp10, align 1 - %conv = zext i8 %tmp11 to i32 - %or = or i32 %conv, %shl - store i32 %or, i32* %sunkaddr496, align 4 - %add = add nsw i32 %tmp7, 8 - %sunkaddr497 = ptrtoint %struct.DState* %s to i64 - %sunkaddr498 = add i64 %sunkaddr497, 36 - %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32* - store i32 %add, i32* %sunkaddr499, align 4 - %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1 - store i8* %incdec.ptr, i8** %tmp8, align 8 - %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64 - %sunkaddr501 = 
add i64 %sunkaddr500, 8 - %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32* - store i32 %lsr.iv482, i32* %sunkaddr502, align 4 - %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64 - %sunkaddr504 = add i64 %sunkaddr503, 12 - %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32* - %tmp12 = load i32, i32* %sunkaddr505, align 4 - %inc = add i32 %tmp12, 1 - store i32 %inc, i32* %sunkaddr505, align 4 - %cmp49 = icmp eq i32 %inc, 0 - br i1 %cmp49, label %if.then.51, label %while.body.backedge - -if.then.51: ; preds = %if.end.37 - %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64 - %sunkaddr507 = add i64 %sunkaddr506, 16 - %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32* - %tmp13 = load i32, i32* %sunkaddr508, align 4 - %inc53 = add i32 %tmp13, 1 - store i32 %inc53, i32* %sunkaddr508, align 4 - br label %while.body.backedge - -while.body.backedge: ; preds = %if.then.51, %if.end.37 - %lsr.iv.next483 = add i32 %lsr.iv482, -1 - %cmp28 = icmp sgt i32 %add, 7 - br i1 %cmp28, label %if.then.29, label %if.end.33 - -if.end.62: ; preds = %if.then.29 - %sub64 = add nsw i32 %and, -48 - %sunkaddr509 = ptrtoint %struct.DState* %s to i64 - %sunkaddr510 = add i64 %sunkaddr509, 40 - %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32* - store i32 %sub64, i32* %sunkaddr511, align 4 - br label %sw.bb.65 - -sw.bb.65: ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge - %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ] - %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ] - %sunkaddr512 = ptrtoint %struct.DState* %s to i64 - %sunkaddr513 = add i64 %sunkaddr512, 8 - %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32* - store i32 14, i32* %sunkaddr514, align 4 - %cmp70.397 = icmp sgt i32 %tmp14, 7 - br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph - -if.end.82.lr.ph: ; preds = %sw.bb.65 - %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream** - %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8 - %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1 - %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4 - %tmp16 = add i32 %.pre431, -1 - br label %if.end.82 - -if.then.72: ; preds = %while.body.68.backedge, %sw.bb.65 - %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ] - %sub76 = add nsw i32 %.lcssa390, -8 - %sunkaddr516 = ptrtoint %struct.DState* %s to i64 - %sunkaddr517 = add i64 %sunkaddr516, 36 - %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32* - store i32 %sub76, i32* %sunkaddr518, align 4 - %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11 - %tmp17 = load i32, i32* %currBlockNo, align 4 - %inc117 = add nsw i32 %tmp17, 1 - store i32 %inc117, i32* %currBlockNo, align 4 - %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12 - %tmp18 = load i32, i32* %verbosity, align 4 - %cmp118 = icmp sgt i32 %tmp18, 1 - br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0 - -if.end.82: ; preds = %while.body.68.backedge, %if.end.82.lr.ph - %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ] - %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ] - %cmp85 = icmp eq i32 %lsr.iv480, -1 - br i1 %cmp85, label %save_state_and_return, label %if.end.88 - -if.end.88: ; preds = %if.end.82 - %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8** - %sunkaddr519 = ptrtoint 
%struct.DState* %s to i64 - %sunkaddr520 = add i64 %sunkaddr519, 32 - %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32* - %tmp21 = load i32, i32* %sunkaddr521, align 4 - %shl90 = shl i32 %tmp21, 8 - %tmp22 = load i8*, i8** %tmp20, align 8 - %tmp23 = load i8, i8* %tmp22, align 1 - %conv93 = zext i8 %tmp23 to i32 - %or94 = or i32 %conv93, %shl90 - store i32 %or94, i32* %sunkaddr521, align 4 - %add97 = add nsw i32 %tmp19, 8 - %sunkaddr522 = ptrtoint %struct.DState* %s to i64 - %sunkaddr523 = add i64 %sunkaddr522, 36 - %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32* - store i32 %add97, i32* %sunkaddr524, align 4 - %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1 - store i8* %incdec.ptr100, i8** %tmp20, align 8 - %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64 - %sunkaddr526 = add i64 %sunkaddr525, 8 - %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32* - store i32 %lsr.iv480, i32* %sunkaddr527, align 4 - %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64 - %sunkaddr529 = add i64 %sunkaddr528, 12 - %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32* - %tmp24 = load i32, i32* %sunkaddr530, align 4 - %inc106 = add i32 %tmp24, 1 - store i32 %inc106, i32* %sunkaddr530, align 4 - %cmp109 = icmp eq i32 %inc106, 0 - br i1 %cmp109, label %if.then.111, label %while.body.68.backedge - -if.then.111: ; preds = %if.end.88 - %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64 - %sunkaddr532 = add i64 %sunkaddr531, 16 - %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32* - %tmp25 = load i32, i32* %sunkaddr533, align 4 - %inc114 = add i32 %tmp25, 1 - store i32 %inc114, i32* %sunkaddr533, align 4 - br label %while.body.68.backedge - -while.body.68.backedge: ; preds = %if.then.111, %if.end.88 - %lsr.iv.next481 = add i32 %lsr.iv480, -1 - %cmp70 = icmp sgt i32 %add97, 7 - br i1 %cmp70, label %if.then.72, label %if.end.82 - -if.then.120: ; preds = %if.then.72 - %call = tail call i32 (%struct.__sFILE*, i8*, ...) 
@fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117) - br label %sw.bb.123 - -sw.bb.123: ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge - %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ] - %sunkaddr534 = ptrtoint %struct.DState* %s to i64 - %sunkaddr535 = add i64 %sunkaddr534, 8 - %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32* - store i32 25, i32* %sunkaddr536, align 4 - %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4 - %cmp128.395 = icmp sgt i32 %tmp26, 7 - br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph - -sw.bb.123.if.then.130_crit_edge: ; preds = %sw.bb.123 - %sunkaddr537 = ptrtoint %struct.DState* %s to i64 - %sunkaddr538 = add i64 %sunkaddr537, 32 - %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32* - %.pre429 = load i32, i32* %sunkaddr539, align 4 - br label %if.then.130 - -if.end.140.lr.ph: ; preds = %sw.bb.123 - %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream** - %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8 - %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1 - %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4 - %tmp28 = add i32 %.pre432, -1 - br label %if.end.140 - -if.then.130: ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge - %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ] - %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ] - %sub134 = add nsw i32 %.lcssa, -8 - %shr135 = lshr i32 %tmp29, %sub134 - store i32 %sub134, i32* %bsLive127.pre-phi, align 4 - %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13 - %tmp30 = load i32, i32* %origPtr, align 4 - %shl175 = shl i32 %tmp30, 8 - %conv176 = and i32 %shr135, 255 - %or177 = or i32 %shl175, %conv176 - store i32 %or177, i32* %origPtr, align 4 - %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27 - %tmp31 = load i32, i32* %nInUse, align 4 - %add179 = add nsw i32 %tmp31, 2 - br label %save_state_and_return - -if.end.140: ; preds = %while.body.126.backedge, %if.end.140.lr.ph - %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ] - %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ] - %cmp143 = icmp eq i32 %lsr.iv, -1 - br i1 %cmp143, label %save_state_and_return, label %if.end.146 - -if.end.146: ; preds = %if.end.140 - %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8** - %sunkaddr541 = ptrtoint %struct.DState* %s to i64 - %sunkaddr542 = add i64 %sunkaddr541, 32 - %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32* - %tmp34 = load i32, i32* %sunkaddr543, align 4 - %shl148 = shl i32 %tmp34, 8 - %tmp35 = load i8*, i8** %tmp33, align 8 - %tmp36 = load i8, i8* %tmp35, align 1 - %conv151 = zext i8 %tmp36 to i32 - %or152 = or i32 %conv151, %shl148 - store i32 %or152, i32* %sunkaddr543, align 4 - %add155 = add nsw i32 %tmp32, 8 - store i32 %add155, i32* %bsLive127.pre-phi, align 4 - %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1 - store i8* %incdec.ptr158, i8** %tmp33, align 8 - %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64 - %sunkaddr545 = add i64 
%sunkaddr544, 8 - %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32* - store i32 %lsr.iv, i32* %sunkaddr546, align 4 - %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64 - %sunkaddr548 = add i64 %sunkaddr547, 12 - %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32* - %tmp37 = load i32, i32* %sunkaddr549, align 4 - %inc164 = add i32 %tmp37, 1 - store i32 %inc164, i32* %sunkaddr549, align 4 - %cmp167 = icmp eq i32 %inc164, 0 - br i1 %cmp167, label %if.then.169, label %while.body.126.backedge - -if.then.169: ; preds = %if.end.146 - %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64 - %sunkaddr551 = add i64 %sunkaddr550, 16 - %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32* - %tmp38 = load i32, i32* %sunkaddr552, align 4 - %inc172 = add i32 %tmp38, 1 - store i32 %inc172, i32* %sunkaddr552, align 4 - br label %while.body.126.backedge - -while.body.126.backedge: ; preds = %if.then.169, %if.end.146 - %lsr.iv.next = add i32 %lsr.iv, -1 - %cmp128 = icmp sgt i32 %add155, 7 - br i1 %cmp128, label %if.then.130, label %if.end.140 - -sw.default: ; preds = %if.end, %if.end.thread - %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ] - %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ] - %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ] - %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ] - %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ] - %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ] - %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ] - %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ] - %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ] - %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ] - %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ] - %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ] - %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ] - %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ] - %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ] - %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ] - %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ] - %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ] - %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ] - %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ] - %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ] - %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ] - %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ] - %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ] - %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ] - %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ] - %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ] - %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ] - %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ] - %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ 
%save_nblock13.phi.trans.insert, %if.end ] - %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ] - %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ] - %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ] - %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ] - %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ] - %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ] - %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ] - tail call void @bar(i32 4001) - br label %save_state_and_return - -save_state_and_return: ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29 - %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ] - %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ] - %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ] - %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ] - %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ] - %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ] - %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ] - %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ] - %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ] - %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ] - %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ] - %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ] - %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ] - %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ] - %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, 
%if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ] - %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ] - %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ] - %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ] - %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ] - %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ] - %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ] - %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ] - %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ] - %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ] - %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ] - %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ] - %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ 
%save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ] - %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ] - %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ] - %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ] - %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ] - %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ] - %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ] - %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ] - %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ] - %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ] - %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ] - %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ] - store i32 %tmp58, i32* %save_i, align 4 - store i32 %tmp59, i32* %save_j3.pre-phi468, align 4 - store i32 %tmp60, i32* %save_t4.pre-phi466, align 4 - store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4 - store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4 - store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4 - store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4 - store i32 %tmp64, i32* 
%save_groupNo9.pre-phi456, align 4 - store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4 - store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4 - store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4 - store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4 - store i32 %tmp68, i32* %save_es14.pre-phi446, align 4 - store i32 %tmp69, i32* %save_N15.pre-phi444, align 4 - store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4 - store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4 - store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4 - store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4 - store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4 - ret i32 %retVal.0 -} - -!0 = !{!"branch_weights", i32 10, i32 1} Index: test/CodeGen/ARM/subreg-remat.ll =================================================================== --- test/CodeGen/ARM/subreg-remat.ll +++ test/CodeGen/ARM/subreg-remat.ll @@ -11,10 +11,10 @@ ; since it implicitly reads the ssub_1 sub-register. ; ; CHECK: f1 -; CHECK: vmov d0, r0, r0 -; CHECK: vldr s1, LCPI +; CHECK: vmov d1, r0, r0 +; CHECK: vldr s3, LCPI ; The vector must be spilled: -; CHECK: vstr d0, +; CHECK: vstr d1, ; CHECK: asm clobber d0 ; And reloaded after the asm: ; CHECK: vldr [[D16:d[0-9]+]], Index: test/CodeGen/SPARC/spill.ll =================================================================== --- test/CodeGen/SPARC/spill.ll +++ test/CodeGen/SPARC/spill.ll @@ -7,8 +7,9 @@ ;; registers to ensure the spill will happen. ; CHECK-LABEL: test_i32_spill: -; CHECK: and %i0, %i1, %o0 -; CHECK: st %o0, [%fp+{{.+}}] +; CHECK: and %i0, %i1, %i0 +; CHECK: mov %i0, %o0 +; CHECK: st %i0, [%fp+{{.+}}] ; CHECK: add %o0, %o0, %g0 ; CHECK: ld [%fp+{{.+}}, %i0 define i32 @test_i32_spill(i32 %a, i32 %b) { @@ -20,9 +21,11 @@ } ; CHECK-LABEL: test_i64_spill: -; CHECK: and %i0, %i2, %o0 -; CHECK: and %i1, %i3, %o1 -; CHECK: std %o0, [%fp+{{.+}}] +; CHECK: and %i0, %i2, %i4 +; CHECK: and %i1, %i3, %i5 +; CHECK: mov %i4, %o0 +; CHECK: mov %i5, %o1 +; CHECK: std %i4, [%fp+{{.+}}] ; CHECK: add %o0, %o0, %g0 ; CHECK: ldd [%fp+{{.+}}, %i0 define i64 @test_i64_spill(i64 %a, i64 %b) { Index: test/CodeGen/X86/avx512-bugfix-25270.ll =================================================================== --- test/CodeGen/X86/avx512-bugfix-25270.ll +++ test/CodeGen/X86/avx512-bugfix-25270.ll @@ -10,9 +10,9 @@ ; CHECK-NEXT: subq $112, %rsp ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vmovdqu32 (%rbx), %zmm0 -; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 ; CHECK-NEXT: vmovdqa32 %zmm1, (%rbx) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill ; CHECK-NEXT: callq _Print__512 ; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload ; CHECK-NEXT: callq _Print__512 Index: test/CodeGen/X86/fold-push.ll =================================================================== --- test/CodeGen/X86/fold-push.ll +++ test/CodeGen/X86/fold-push.ll @@ -5,8 +5,9 @@ define void @test(i32 %a, i32 %b) optsize nounwind { ; CHECK-LABEL: test: -; CHECK: movl [[EAX:%e..]], (%esp) -; CHECK-NEXT: pushl [[EAX]] +; CHECK: addl +; CHECK-NEXT: pushl [[EAX:%e..]] +; CHECK-NEXT: movl [[EAX]], 4(%esp) ; CHECK-NEXT: calll ; CHECK-NEXT: addl $4, %esp ; CHECK: nop @@ -24,8 +25,9 @@ define void @test_min(i32 %a, i32 %b) minsize nounwind { ; CHECK-LABEL: test_min: -; CHECK: movl [[EAX:%e..]], (%esp) -; CHECK-NEXT: pushl [[EAX]] +; CHECK: addl +; CHECK-NEXT: pushl [[EAX:%e..]] +; CHECK-NEXT: movl [[EAX]], 4(%esp) ; CHECK-NEXT: calll ; CHECK-NEXT: popl ; CHECK: nop 
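The first new test below, hoist-spill.ll, asserts that no two spill instructions remaining after hoisting write the same rsp-based stack slot: its RUN line extracts every Spill-annotated store, reduces it to its stack offset, and fails on any duplicate. Conceptually the shell pipeline performs the check sketched here; the input list and the name hasDuplicateSpillSlot are hypothetical, used only to spell out the pipeline's logic.

#include <string>
#include <unordered_set>
#include <vector>

// SpillSlots: one entry per Spill-annotated store in the final assembly,
// already reduced to its address operand, e.g. "-24(%rsp)".
// Returns true exactly when the test's sort | uniq -d | awk pipeline fails.
bool hasDuplicateSpillSlot(const std::vector<std::string> &SpillSlots) {
  std::unordered_set<std::string> Seen;
  for (const std::string &Slot : SpillSlots)
    if (!Seen.insert(Slot).second)
      return true; // the same slot is written twice: a spill hoisting missed
  return false;
}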
Index: test/CodeGen/X86/hoist-spill.ll =================================================================== --- test/CodeGen/X86/hoist-spill.ll +++ test/CodeGen/X86/hoist-spill.ll @@ -0,0 +1,115 @@ +; RUN: llc < %s | grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/) exit -1}' +; Check that there are no spills to the same stack slot after hoisting. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = external global i32*, align 8 +@b = external global i32, align 4 +@d = external global i32*, align 8 + +; Function Attrs: norecurse noreturn nounwind uwtable +define void @fn1(i32 %p1) #0 { +entry: + %tmp = load i32*, i32** @d, align 8 + %tmp1 = load i32*, i32** @a, align 8 + %tmp2 = sext i32 %p1 to i64 + br label %for.cond + +for.cond: ; preds = %for.inc14, %entry + %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ] + %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ] + %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ] + %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ] + %tmp3 = icmp sgt i32 undef, 0 + %smax52 = select i1 %tmp3, i32 undef, i32 0 + %tmp4 = zext i32 %smax52 to i64 + %tmp5 = icmp sgt i64 undef, %tmp4 + %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4 + %tmp6 = add nsw i64 %smax53, 1 + %tmp7 = sub nsw i64 %tmp6, %tmp4 + %tmp8 = add nsw i64 %tmp7, -8 + %tmp9 = sub i32 undef, %indvar + %tmp10 = icmp sgt i64 %tmp2, 0 + %smax40 = select i1 %tmp10, i64 %tmp2, i64 0 + %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40 + %indvars.iv30 = add i32 %indvars.iv30.in, -1 + %tmp11 = icmp sgt i32 %indvars.iv30, 0 + %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0 + %tmp12 = zext i32 %smax to i64 + %sub = sub nsw i32 %p1, %c.0 + %cmp = icmp sgt i32 %sub, 0 + %sub.
= select i1 %cmp, i32 %sub, i32 0 + %cmp326 = icmp sgt i32 %k.0, %p1 + br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader + +for.body.preheader: ; preds = %for.cond + br label %for.body + +for.cond4.preheader: ; preds = %for.body, %for.cond + %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ] + %cmp528 = icmp sgt i32 %sub., %p1 + br i1 %cmp528, label %for.inc14, label %for.body6.preheader + +for.body6.preheader: ; preds = %for.cond4.preheader + br i1 undef, label %for.body6, label %min.iters.checked + +min.iters.checked: ; preds = %for.body6.preheader + br i1 undef, label %for.body6, label %vector.memcheck + +vector.memcheck: ; preds = %min.iters.checked + %bound1 = icmp ule i32* undef, %scevgep41 + %memcheck.conflict = and i1 undef, %bound1 + br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + %lcmp.mod = icmp eq i64 undef, 0 + br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol, %vector.body.preheader + %prol.iter.cmp = icmp eq i64 undef, 0 + br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol + +vector.body.preheader.split: ; preds = %vector.body.prol, %vector.body.preheader + %tmp13 = icmp ult i64 %tmp8, 24 + br i1 %tmp13, label %middle.block, label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.split + %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ] + %index.next = add i64 %index, 8 + %offset.idx.1 = add i64 %tmp12, %index.next + %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1 + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4 + %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1 + %tmp17 = bitcast i32* %tmp16 to <4 x i32>* + store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4 + %index.next.3 = add i64 %index, 32 + br i1 undef, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body, %vector.body.preheader.split + br i1 undef, label %for.inc14, label %for.body6 + +for.body: ; preds = %for.body, %for.body.preheader + %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ] + %add = add nsw i32 %k.127, 1 + %tmp18 = load i32, i32* undef, align 4 + store i32 %tmp18, i32* @b, align 4 + br i1 undef, label %for.body, label %for.cond4.preheader + +for.body6: ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader + %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ] + %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32 + %tmp19 = load i32, i32* %arrayidx8, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32 + store i32 %tmp19, i32* %arrayidx10, align 4 + %cmp5 = icmp slt i64 %indvars.iv32, undef + br i1 %cmp5, label %for.body6, label %for.inc14 + +for.inc14: ; preds = %for.body6, %middle.block, %for.cond4.preheader + %inc15 = add nuw nsw i32 %c.0, 1 + %indvar.next = add i32 %indvar, 1 + br label %for.cond +} + +attributes #0 = { norecurse noreturn nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
"target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/X86/new-remat.ll =================================================================== --- test/CodeGen/X86/new-remat.ll +++ test/CodeGen/X86/new-remat.ll @@ -0,0 +1,75 @@ +; RUN: llc < %s | FileCheck %s +; Check that all spills are rematerialized. +; CHECK-NOT: Spill + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@b = common global double 0.000000e+00, align 8 +@a = common global i32 0, align 4 + +; Function Attrs: nounwind uwtable +define i32 @uniform_testdata(i32 %p1) #0 { +entry: + %cmp3 = icmp sgt i32 %p1, 0 + br i1 %cmp3, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %p1, -1 + %xtraiter = and i32 %p1, 7 + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader + +for.body.prol.preheader: ; preds = %for.body.preheader + br label %for.body.prol + +for.body.prol: ; preds = %for.body.prol.preheader, %for.body.prol + %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ] + %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ] + %1 = load double, double* @b, align 8 + %call.prol = tail call double @pow(double %1, double 2.500000e-01) #2 + %inc.prol = add nuw nsw i32 %i.04.prol, 1 + %prol.iter.sub = add i32 %prol.iter, -1 + %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0 + br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol + +for.body.preheader.split.loopexit: ; preds = %for.body.prol + %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ] + br label %for.body.preheader.split + +for.body.preheader.split: ; preds = %for.body.preheader.split.loopexit, %for.body.preheader + %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ] + %2 = icmp ult i32 %0, 7 + br i1 %2, label %for.end.loopexit, label %for.body.preheader.split.split + +for.body.preheader.split.split: ; preds = %for.body.preheader.split + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader.split.split + %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ] + %3 = load double, double* @b, align 8 + %call = tail call double @pow(double %3, double 2.500000e-01) #2 + %4 = load double, double* @b, align 8 + %call.1 = tail call double @pow(double %4, double 2.500000e-01) #2 + %inc.7 = add nsw i32 %i.04, 8 + %exitcond.7 = icmp eq i32 %inc.7, %p1 + br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body + +for.end.loopexit.unr-lcssa: ; preds = %for.body + br label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body.preheader.split, %for.end.loopexit.unr-lcssa + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %5 = load i32, i32* @a, align 4 + ret i32 %5 +} + +; Function Attrs: nounwind +declare double @pow(double, double) #1 + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false"
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + Index: test/CodeGen/X86/ragreedy-hoist-spill.ll =================================================================== --- test/CodeGen/X86/ragreedy-hoist-spill.ll +++ test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -2,6 +2,7 @@ ; This testing case is reduced from 254.gap SyFgets function. ; We make sure a spill is not hoisted to a hotter outer loop. +; We make sure a spill is hoisted to a cold BB inside the hotter outer loop. %struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] } %struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 } @@ -181,6 +182,10 @@ br i1 %cmp476, label %if.end517, label %do.body479.preheader do.body479.preheader: + ; CHECK: do.body479.preheader + ; The spill is hoisted here: although the loop at depth 1 is even hotter than the loop at depth 2, do.body479.preheader is cold. + ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) + ; CHECK: land.rhs485 %cmp4833314 = icmp eq i8 undef, 0 br i1 %cmp4833314, label %if.end517, label %land.rhs485 @@ -200,8 +205,8 @@ lor.rhs500: ; CHECK: lor.rhs500 - ; Make sure that we don't hoist the spill to outer loops. - ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) + ; Make sure the spill is hoisted to a cold preheader in the outer loop. + ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp) ; CHECK: callq {{.*}}maskrune %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256) br i1 undef, label %land.lhs.true504, label %do.body479.backedge Index: test/CodeGen/X86/vselect-minmax.ll =================================================================== --- test/CodeGen/X86/vselect-minmax.ll +++ test/CodeGen/X86/vselect-minmax.ll @@ -4888,13 +4888,14 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test122: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -5163,7 +5164,6 @@ ; SSE2-LABEL: test124: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -5172,6 +5172,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 @@ -5465,13 +5466,14 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test126: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 =
[2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -5794,7 +5796,6 @@ ; SSE2-LABEL: test128: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -5803,6 +5804,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 @@ -7608,13 +7610,14 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test154: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -7881,7 +7884,6 @@ ; SSE2-LABEL: test156: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -7890,6 +7892,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 @@ -8181,13 +8184,14 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test158: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -8508,7 +8512,6 @@ ; SSE2-LABEL: test160: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -8517,6 +8520,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11