Index: llvm/include/llvm/CodeGen/MachinePassRegistry.def
===================================================================
--- llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -199,4 +199,5 @@
 DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("machine-cycles", MachineCycleInfoWrapperPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("dupconstphiusers", DupConstPhiUsersPass, ())
 #undef DUMMY_MACHINE_FUNCTION_PASS
Index: llvm/include/llvm/CodeGen/Passes.h
===================================================================
--- llvm/include/llvm/CodeGen/Passes.h
+++ llvm/include/llvm/CodeGen/Passes.h
@@ -355,6 +355,9 @@
   /// This pass implements the "patchable-function" attribute.
   extern char &PatchableFunctionID;
 
+  /// This pass duplicates the users of constant PHIs.
+  extern char &DupConstPhiUsersID;
+
   /// createStackProtectorPass - This pass adds stack protectors to functions.
   ///
   FunctionPass *createStackProtectorPass();
Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1578,6 +1578,14 @@
     return false;
   }
 
+  /// Check if the ImmVal can be folded into the use instruction.
+  /// This function must be synchronized with FoldImmediate.
+  virtual bool canFoldImmediate(MachineInstr &UseMI, Register UseReg,
+                                int64_t ImmVal,
+                                MachineRegisterInfo *MRI) const {
+    return false;
+  }
+
   /// Return the number of u-operations the given machine
   /// instruction will be decoded to on the target cpu. The itinerary's
   /// IssueWidth is the number of microops that can be dispatched each
Index: llvm/include/llvm/InitializePasses.h
===================================================================
--- llvm/include/llvm/InitializePasses.h
+++ llvm/include/llvm/InitializePasses.h
@@ -144,6 +144,7 @@
 void initializeDominanceFrontierWrapperPassPass(PassRegistry&);
 void initializeDominatorTreeWrapperPassPass(PassRegistry&);
+void initializeDupConstPhiUsersPass(PassRegistry&);
 void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
 void initializeEarlyCSELegacyPassPass(PassRegistry&);
 void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry&);
 void initializeEarlyIfConverterPass(PassRegistry&);
Index: llvm/lib/CodeGen/CMakeLists.txt
===================================================================
--- llvm/lib/CodeGen/CMakeLists.txt
+++ llvm/lib/CodeGen/CMakeLists.txt
@@ -49,6 +49,7 @@
   DetectDeadLanes.cpp
   DFAPacketizer.cpp
+  DupConstPhiUsers.cpp
   DwarfEHPrepare.cpp
   EarlyIfConversion.cpp
   EdgeBundles.cpp
   EHContGuardCatchret.cpp
Index: llvm/lib/CodeGen/CodeGen.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGen.cpp
+++ llvm/lib/CodeGen/CodeGen.cpp
@@ -31,6 +31,7 @@
   initializeDebugifyMachineModulePass(Registry);
   initializeDetectDeadLanesPass(Registry);
+  initializeDupConstPhiUsersPass(Registry);
   initializeDwarfEHPrepareLegacyPassPass(Registry);
   initializeEarlyIfConverterPass(Registry);
   initializeEarlyIfPredicatorPass(Registry);
   initializeEarlyMachineLICMPass(Registry);
Index: llvm/lib/CodeGen/DupConstPhiUsers.cpp
===================================================================
--- /dev/null
+++ llvm/lib/CodeGen/DupConstPhiUsers.cpp
@@ -0,0 +1,731 @@
+//===- DupConstPhiUsers.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This pass finds PHIs whose incoming values are constants, duplicates
+// their single user into the predecessors, and folds the constants in.
+//
+// bb.0:
+//   ...
+//   %3:gr32 = MOV32ri 7
+//   JCC_1 %bb.2, 5, implicit $eflags
+//
+// bb.1:
+//   %5:gr32 = MOV32ri 11
+//
+// bb.2:
+//   %0:gr32 = PHI %3:gr32, %bb.0, %5:gr32, %bb.1
+//   %6:gr32 = ADD32rr %2:gr32(tied-def 0), %0:gr32, implicit-def dead $eflags
+//   ...
+//
+// =>
+//
+// bb.0:
+//   ...
+//   %7:gr32 = ADD32ri8 %2:gr32(tied-def 0), 7, implicit-def dead $eflags
+//   JCC_1 %bb.2, 5, implicit $eflags
+//
+// bb.1:
+//   %8:gr32 = ADD32ri8 %2:gr32(tied-def 0), 11, implicit-def dead $eflags
+//
+// bb.2:
+//   %6:gr32 = PHI %7:gr32, %bb.0, %8:gr32, %bb.1
+//   ...
+//
+//===----------------------------------------------------------------------===//
+
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dupconstphiusers"
+
+#define MAX_ROUNDS 10
+
+namespace {
+
+class DupConstPhiUsers : public MachineFunctionPass {
+public:
+  static char ID;
+  DupConstPhiUsers() : MachineFunctionPass(ID) {
+    initializeDupConstPhiUsersPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  struct CandidateInfo {
+    // The PHI instruction whose incoming registers are mostly constants.
+    MachineInstr *Phi;
+    // The user of the PHI dest register; it will be duplicated into the
+    // predecessors.
+    MachineInstr *UserMI;
+
+    // Some instructions accept either a fixed physical register or an
+    // immediate, like
+    //   %6:gr8 = PHI ...
+    //   $cl = COPY %6:gr8
+    //   %7:gr32 = SHL32rCL %1:gr32(tied-def 0), implicit-def dead $eflags, implicit $cl
+    // In this case we can still duplicate the SHL into the predecessors and
+    // fold the constant.
+    MachineInstr *PhyCopy;
+
+    // The PHI dest register, usually used by UserMI except when there is a
+    // PhyReg.
+    Register Reg;
+    // When set, it is the dest physical register of PhyCopy, used by UserMI.
+    Register PhyReg;
+
+    // Dependent instructions whose results are used by UserMI. They are in
+    // the same MBB as UserMI and will be moved to the dominator.
+    SmallSetVector<MachineInstr *, 8> DepMIs;
+
+    // The first Register is a PHI dest register used by UserMI. A single
+    // UserMI may have multiple operands coming from PHI instructions; we must
+    // record all of them and replace them with the appropriate incoming
+    // registers after the duplication.
+    // The second Register is the PHI incoming register from the corresponding
+    // predecessor.
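+    // For the example at the top of this file (illustrative), PhiRegs maps
+    //   %0 -> { %bb.0 -> %3, %bb.1 -> %5 }
+    // so the duplicate in %bb.0 uses %3 and the duplicate in %bb.1 uses %5.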
+    DenseMap<Register, DenseMap<MachineBasicBlock *, Register>> PhiRegs;
+
+    // Insert position in the predecessors for the duplicated instructions.
+    // Also includes the insert position in the dominator for dependent
+    // instructions if necessary.
+    DenseMap<MachineBasicBlock *, MachineBasicBlock::iterator> InsertPos;
+
+    CandidateInfo(MachineInstr *P) {
+      Phi = P;
+      UserMI = PhyCopy = nullptr;
+    }
+  };
+
+  SmallSetVector<MachineInstr *, 8> CollectConstPhis(MachineBasicBlock &MBB);
+  MachineInstr *FindSinglePhyRegUser(Register PhyReg, MachineInstr *CopyMI);
+  bool FindCandidateInfo(CandidateInfo &Candidate);
+  bool isMISafeToMove(MachineInstr *MI, CandidateInfo &Candidate);
+  void CollectPhiRegs(CandidateInfo &Candidate);
+  bool CheckMIAtPos(MachineInstr *MI, MachineBasicBlock *Pred,
+                    MachineBasicBlock::iterator Pos, LiveRegUnits &RegUnits,
+                    CandidateInfo &Candidate);
+  bool FindInsertPosMBB(MachineBasicBlock *PredBB,
+                        SmallSetVector<MachineInstr *, 8> &Instrs,
+                        MachineBasicBlock::iterator &Pos,
+                        CandidateInfo &Candidate);
+  bool FindInsertPos(CandidateInfo &Candidate);
+  void MoveDepMIs(CandidateInfo &Candidate);
+  MachineInstr *DuplicatePhiUser(CandidateInfo &Candidate);
+  MachineInstr *tryToDupPhiUser(MachineInstr *Phi);
+
+  SmallSetVector<MachineInstr *, 8> Worklist;
+  SmallSetVector<MachineInstr *, 8> HelperList;
+
+  MachineDominatorTree *MDT;
+  MachineRegisterInfo *MRI;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+};
+
+} // end anonymous namespace
+
+char DupConstPhiUsers::ID;
+
+char &llvm::DupConstPhiUsersID = DupConstPhiUsers::ID;
+
+INITIALIZE_PASS_BEGIN(DupConstPhiUsers, DEBUG_TYPE, "Duplicate PHI Users",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(DupConstPhiUsers, DEBUG_TYPE, "Duplicate PHI Users",
+                    false, false)
+
+// Collect a list of PHI instructions whose operands are mostly constants.
+// There can be at most one non-constant incoming value.
+SmallSetVector<MachineInstr *, 8>
+DupConstPhiUsers::CollectConstPhis(MachineBasicBlock &MBB) {
+  SmallSetVector<MachineInstr *, 8> Phis;
+  for (MachineBasicBlock::iterator PhiIt : MBB.phis()) {
+    int NonConst = 0;
+    for (unsigned I = 1, E = PhiIt->getNumOperands(); I != E; I += 2) {
+      Register Reg = PhiIt->getOperand(I).getReg();
+      if (Reg.isPhysical()) {
+        // We don't handle physical PHI incoming registers.
+        NonConst = 2;
+        break;
+      }
+      int64_t ImmVal;
+      MachineInstr *Def = MRI->getVRegDef(Reg);
+      if (!TII->getConstValDefinedInReg(*Def, Reg, ImmVal))
+        NonConst++;
+    }
+    if (NonConst <= 1)
+      Phis.insert(&*PhiIt);
+  }
+  return Phis;
+}
+
+// PhyReg is defined by CopyMI. This function returns the single user of
+// PhyReg, or nullptr if there is more than one.
+MachineInstr *DupConstPhiUsers::FindSinglePhyRegUser(Register PhyReg,
+                                                     MachineInstr *CopyMI) {
+  MachineBasicBlock *MBB = CopyMI->getParent();
+  MachineInstr *UserMI = nullptr;
+  for (auto I = std::next(MachineBasicBlock::iterator(CopyMI));
+       I != MBB->end(); I++) {
+    if (I->findRegisterUseOperandIdx(PhyReg, false, TRI) != -1) {
+      if (UserMI)
+        return nullptr;
+      UserMI = &*I;
+    }
+    if (I->findRegisterDefOperandIdx(PhyReg, false, true, TRI) != -1)
+      return UserMI;
+  }
+
+  if (!UserMI)
+    return nullptr;
+  // We have found a single user of PhyReg in the current MBB. But we still
+  // need to prove that PhyReg is not live out of the MBB.
+  for (MachineBasicBlock *Succ : MBB->successors())
+    if (Succ->isLiveIn(PhyReg))
+      return nullptr;
+
+  return UserMI;
+}
+
+// Check if Reg is only used by PHI instructions in MBB. Otherwise it can't be
+// deleted after being folded into the duplicated ALU instructions.
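+// For example (illustrative), in
+//   %5:gr32 = MOV32ri 11
+//   %0:gr32 = PHI %3:gr32, %bb.0, %5:gr32, %bb.1
+//   %9:gr32 = ADD32rr %4:gr32(tied-def 0), %5:gr32, implicit-def dead $eflags
+// the MOV32ri also has a non-PHI user, so it must stay even after its value
+// has been folded into the duplicated PHI user.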
+static bool hasOnlyPhiUser(Register Reg, MachineBasicBlock *MBB,
+                           MachineRegisterInfo *MRI) {
+  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg))
+    if (!UseMI.isPHI() || UseMI.getParent() != MBB)
+      return false;
+  return true;
+}
+
+// Check and fill in the basic candidate info.
+bool DupConstPhiUsers::FindCandidateInfo(CandidateInfo &Candidate) {
+  MachineInstr *Phi = Candidate.Phi;
+
+  // Some quick checks.
+  unsigned NumPHIOperands = Phi->getNumOperands();
+  if (NumPHIOperands <= 3)
+    return false;
+  Register PhiReg = Phi->getOperand(0).getReg();
+  if (!MRI->hasOneNonDBGUse(PhiReg))
+    return false;
+  MachineInstr *UserMI = &*MRI->use_instr_nodbg_begin(PhiReg);
+  if (UserMI->getParent() != Phi->getParent())
+    return false;
+
+  // Check for the special case:
+  //   %6:gr8 = PHI ...
+  //   $cl = COPY %6:gr8
+  //   %7:gr32 = SHL32rCL %1:gr32(tied-def 0), implicit-def dead $eflags, implicit $cl
+  Register UseReg = PhiReg;
+  if (UserMI->getOpcode() == TargetOpcode::COPY) {
+    Register CopyReg = UserMI->getOperand(0).getReg();
+    if (CopyReg.isPhysical()) {
+      // We need to check if there is a single user of CopyReg in the current
+      // MBB.
+      MachineInstr *CopyMI = UserMI;
+      UserMI = FindSinglePhyRegUser(CopyReg, CopyMI);
+      if (!UserMI)
+        return false;
+      Candidate.PhyReg = CopyReg;
+      Candidate.PhyCopy = CopyMI;
+      UseReg = CopyReg;
+    }
+  }
+
+  // Check if most of the operands of the PHI are simple constants that can be
+  // folded into the PHI user. At most one incoming value may be non-foldable.
+  MachineBasicBlock *NonFoldableBB = nullptr;
+  for (unsigned I = 1; I != NumPHIOperands; I += 2) {
+    MachineBasicBlock *PredBB = Phi->getOperand(I + 1).getMBB();
+    Register InReg = Phi->getOperand(I).getReg();
+    MachineInstr *Def = MRI->getVRegDef(InReg);
+    int64_t ImmVal;
+    if (!TII->getConstValDefinedInReg(*Def, InReg, ImmVal) ||
+        !TII->canFoldImmediate(*UserMI, UseReg, ImmVal, MRI) ||
+        !hasOnlyPhiUser(InReg, Phi->getParent(), MRI)) {
+      if (NonFoldableBB)
+        return false;
+      // We'll move UserMI to NonFoldableBB. If (NonFoldableBB, MBB) is a
+      // critical edge, the move would add cost to the other paths, so only do
+      // it when the current MBB is the only successor of NonFoldableBB.
+      // This is very conservative. In the future we may compare the increased
+      // cost on the critical edge with the decreased cost on the other edges.
+      if (PredBB->succ_size() > 1)
+        return false;
+      NonFoldableBB = PredBB;
+    }
+  }
+
+  Candidate.UserMI = UserMI;
+  Candidate.Reg = PhiReg;
+  return true;
+}
+
+// Check if it is safe to move/fold MI to the predecessors.
+// MI may depend on other instructions; we can move them into the dominator BB
+// if that does not break any dependence. All dependent instructions are
+// pushed into Candidate.DepMIs.
+bool DupConstPhiUsers::isMISafeToMove(MachineInstr *MI,
+                                      CandidateInfo &Candidate) {
+  bool DontMoveAcrossStore = true;
+  if (!MI->isSafeToMove(nullptr, DontMoveAcrossStore))
+    return false;
+
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    Register MOReg = MO.getReg();
+    if (!MOReg)
+      continue;
+    if (MO.isDef()) {
+      if (MOReg.isVirtual())
+        continue;
+      // MO is a physical reg def; we handle dead defs only. Non-dead defs are
+      // also possible, but need more checking.
+      if (MO.isDead())
+        continue;
+      return false;
+    }
+    if (MOReg.isPhysical()) {
+      if (MOReg != Candidate.PhyReg)
+        return false;
+      continue;
+    }
+    MachineInstr *Def = MRI->getVRegDef(MOReg);
+    if (Def->getParent() != MI->getParent())
+      continue;
+    // UserMI's operands can be PHI values. Dependent instructions' operands
+    // can't be PHI values because those instructions will be moved to the
+    // dominator BB.
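+    // E.g. (illustrative) in
+    //   %4:gr32 = COPY %2.sub_32bit:gr64
+    //   %6:gr32 = XOR32rr %4:gr32(tied-def 0), %0:gr32, implicit-def dead $eflags
+    // where %0 is a PHI dest register and %6 is the PHI user, the COPY
+    // defining %4 is a dependent instruction; it is moved to the dominator,
+    // so it must not read a PHI dest register itself.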
+    if (Def->isPHI()) {
+      if (MI == Candidate.UserMI)
+        continue;
+      else
+        return false;
+    }
+
+    // Def is a dependent instruction in the same BB. We also need to check if
+    // it is safe to move before adding it to DepMIs.
+    if (Candidate.DepMIs.contains(Def))
+      continue;
+    if (!isMISafeToMove(Def, Candidate))
+      return false;
+    Candidate.DepMIs.insert(Def);
+  }
+  return true;
+}
+
+// Collect all incoming registers for the PHIs used by UserMI.
+void DupConstPhiUsers::CollectPhiRegs(CandidateInfo &Candidate) {
+  for (const MachineOperand &MO : Candidate.UserMI->operands()) {
+    if (!MO.isReg() || MO.isDef())
+      continue;
+    Register PhiReg = MO.getReg();
+    if (PhiReg.isPhysical())
+      continue;
+    MachineInstr *Phi = MRI->getVRegDef(PhiReg);
+    if (!Phi->isPHI() || Phi->getParent() != Candidate.UserMI->getParent())
+      continue;
+    for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) {
+      Register InReg = Phi->getOperand(I).getReg();
+      MachineBasicBlock *PredBB = Phi->getOperand(I + 1).getMBB();
+      Candidate.PhiRegs[PhiReg].insert(std::make_pair(PredBB, InReg));
+    }
+  }
+
+  // If UserMI uses a physical register copied from a PHI reg, we need to add
+  // that PHI reg as well.
+  if (Candidate.PhyReg) {
+    Register PhiReg = Candidate.Reg;
+    MachineInstr *Phi = Candidate.Phi;
+    for (unsigned I = 1, E = Phi->getNumOperands(); I != E; I += 2) {
+      Register InReg = Phi->getOperand(I).getReg();
+      MachineBasicBlock *PredBB = Phi->getOperand(I + 1).getMBB();
+      Candidate.PhiRegs[PhiReg].insert(std::make_pair(PredBB, InReg));
+    }
+  }
+}
+
+// Check if we can move MI before Pos in Pred.
+bool DupConstPhiUsers::CheckMIAtPos(MachineInstr *MI, MachineBasicBlock *Pred,
+                                    MachineBasicBlock::iterator Pos,
+                                    LiveRegUnits &RegUnits,
+                                    CandidateInfo &Candidate) {
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    Register Reg = MO.getReg();
+
+    if (Reg.isPhysical()) {
+      // In isMISafeToMove we only allow dead physical register defs, or a
+      // physical register defined by a COPY and used by UserMI. So here we
+      // check whether the physical register is live.
+      if (RegUnits.available(Reg))
+        continue;
+      return false;
+    }
+
+    if (MO.isDef())
+      continue;
+
+    // A virtual register used by MI is either defined in the same MBB as MI
+    // and recorded as a dependent instruction, or it comes from a
+    // predecessor, in which case it is always available at the end of the
+    // predecessor.
+    if (Pos == Pred->end())
+      continue;
+
+    // Check if Reg is defined by a PHI. If so we need to check the
+    // corresponding register in PredBB.
+    auto PRI = Candidate.PhiRegs.find(Reg);
+    if (PRI != Candidate.PhiRegs.end())
+      Reg = PRI->second[Pred];
+
+    // If the definition of Reg is in the same MBB as MI, it's a dependent
+    // instruction that will be moved to the dominator.
+    MachineInstr *Def = MRI->getVRegDef(Reg);
+    if (Def->getParent() == MI->getParent())
+      continue;
+
+    // Otherwise Def must properly dominate Pos.
+    if (!(MDT->dominates(Def, &*Pos) && Def != &*Pos))
+      return false;
+  }
+  return true;
+}
+
+// Find a position in PredBB to insert the instructions from Instrs.
+bool DupConstPhiUsers::FindInsertPosMBB(MachineBasicBlock *PredBB,
+                                        SmallSetVector<MachineInstr *, 8> &Instrs,
+                                        MachineBasicBlock::iterator &Pos,
+                                        CandidateInfo &Candidate) {
+  MachineBasicBlock::iterator ITerm =
+      findPHICopyInsertPoint(PredBB, Candidate.Phi->getParent(),
+                             Candidate.PhiRegs[Candidate.Reg][PredBB]);
+
+  // Collect the live physical regs from PredBB->end() back to the first
+  // terminator.
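+  // The duplicated instructions may clobber physical registers (e.g. EFLAGS
+  // on X86), so track which register units are live at each candidate
+  // position while walking backwards from the block end.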
+  LiveRegUnits PhyRegUnits(*TRI);
+  PhyRegUnits.addLiveOuts(*PredBB);
+  auto I = PredBB->end();
+  while (I != ITerm) {
+    I--;
+    PhyRegUnits.stepBackward(*I);
+  }
+
+  // Check each position to see if the dependences can be satisfied.
+  bool Found = false;
+  while (true) {
+    bool Fail = false;
+    for (MachineInstr *MI : Instrs) {
+      if (CheckMIAtPos(MI, PredBB, I, PhyRegUnits, Candidate))
+        continue;
+      Fail = true;
+      break;
+    }
+
+    if (!Fail) {
+      Pos = I;
+      Found = true;
+      // Found a possible insert position. But we still need to check earlier
+      // positions to avoid the following situation:
+      //
+      //   %5:gr32 = SUB32ri %1:gr32(tied-def 0), 1025, implicit-def $eflags
+      //   %9:gr8 = MOV8ri 3
+      //   JCC_1 %bb.2, 2, implicit $eflags
+      //
+      // The MOV8ri is inserted between the SUB32ri and the JCC. When the user
+      // of %9 is also duplicated into this BB, it can only be placed between
+      // the MOV and the JCC, but most X86 ALU instructions clobber $eflags,
+      // so there is no possible position for %9's user ALU instruction.
+      // Inserting the MOV instruction at an earlier position solves the
+      // problem.
+    }
+
+    if (I == PredBB->begin())
+      return Found;
+
+    // Move the position one instruction earlier.
+    I--;
+    PhyRegUnits.stepBackward(*I);
+    if (I->isPHI())
+      return Found;
+  }
+}
+
+// Find positions in the predecessors to insert the duplicated PHI user.
+bool DupConstPhiUsers::FindInsertPos(CandidateInfo &Candidate) {
+  MachineBasicBlock::iterator Pos;
+  MachineInstr *MI = Candidate.UserMI;
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock *DominatorBB = (*MDT)[MBB]->getIDom()->getBlock();
+
+  // Find an insert position in each predecessor.
+  SmallSetVector<MachineInstr *, 8> MIs;
+  MIs.insert(MI);
+  if (Candidate.PhyCopy)
+    MIs.insert(Candidate.PhyCopy);
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    bool FoundPos;
+    // If PredBB is also the dominator, we'll move all instructions from
+    // DepMIs and MI to PredBB.
+    if (Pred == DominatorBB && !Candidate.DepMIs.empty()) {
+      // Temporarily add UserMI and the COPY to DepMIs. Remove them later.
+      Candidate.DepMIs.insert(MI);
+      if (Candidate.PhyCopy)
+        Candidate.DepMIs.insert(Candidate.PhyCopy);
+      FoundPos = FindInsertPosMBB(Pred, Candidate.DepMIs, Pos, Candidate);
+      if (Candidate.PhyCopy)
+        Candidate.DepMIs.pop_back();
+      Candidate.DepMIs.pop_back();
+    } else
+      // PredBB is not the dominator.
+      FoundPos = FindInsertPosMBB(Pred, MIs, Pos, Candidate);
+    if (!FoundPos)
+      return false;
+    Candidate.InsertPos[Pred] = Pos;
+  }
+
+  // If we have instructions to be moved into the dominator, and the dominator
+  // is not a predecessor of MBB, we need to find an insert position in the
+  // dominator as well.
+  if (!Candidate.DepMIs.empty() && !MBB->isPredecessor(DominatorBB)) {
+    if (!FindInsertPosMBB(DominatorBB, Candidate.DepMIs, Pos, Candidate))
+      return false;
+    Candidate.InsertPos[DominatorBB] = Pos;
+  }
+  return true;
+}
+
+// Move the dependent instructions to the immediate dominator.
+void DupConstPhiUsers::MoveDepMIs(CandidateInfo &Candidate) {
+  if (!Candidate.DepMIs.empty()) {
+    MachineBasicBlock *MBB = Candidate.Phi->getParent();
+    MachineBasicBlock *DominatorBB = (*MDT)[MBB]->getIDom()->getBlock();
+    MachineBasicBlock::iterator ToPos = Candidate.InsertPos[DominatorBB];
+    for (MachineInstr *MI : Candidate.DepMIs) {
+      MBB->remove_instr(MI);
+      DominatorBB->insert(ToPos, MI);
+      MI->clearKillInfo();
+    }
+  }
+}
+
+// Duplicate UserMI into the predecessors and try to fold in the immediate
+// operands.
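+// Overview of the steps below: (1) build a new PHI in the original block
+// that merges the duplicated defs, (2) clone the COPY (if any) and UserMI
+// into each predecessor, rewriting PHI-defined operands to the corresponding
+// incoming registers and folding immediates, and (3) erase the original
+// instructions plus any PHIs and constant defs that became dead.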
+MachineInstr *DupConstPhiUsers::DuplicatePhiUser(CandidateInfo &Candidate) {
+  LLVM_DEBUG(dbgs() << "Duplicating PHI user: " << *Candidate.UserMI);
+  LLVM_DEBUG(dbgs() << "PHI: " << *Candidate.Phi);
+  if (Candidate.PhyCopy)
+    LLVM_DEBUG(dbgs() << "COPY: " << *Candidate.PhyCopy);
+
+  // UserMI will be duplicated into the predecessors, so create a new PHI
+  // instruction to merge the duplicates.
+  MachineInstr *UserMI = Candidate.UserMI;
+  MachineBasicBlock *MBB = UserMI->getParent();
+  Register DstReg = UserMI->getOperand(0).getReg();
+  const TargetRegisterClass *RegRC = MRI->getRegClass(DstReg);
+  MachineInstrBuilder MIB = BuildMI(*MBB, MBB->begin(), UserMI->getDebugLoc(),
+                                    TII->get(TargetOpcode::PHI), DstReg);
+
+  // The COPY and UserMI are duplicated into the predecessors, so the attached
+  // kill info may no longer be correct.
+  if (Candidate.PhyCopy)
+    Candidate.PhyCopy->clearKillInfo();
+  UserMI->clearKillInfo();
+
+  // Now we can duplicate the COPY and UserMI into the predecessors.
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    MachineInstr *ImmDef;
+    MachineInstr *NewCopy = nullptr;
+    MachineBasicBlock::iterator InsertPos = Candidate.InsertPos[Pred];
+
+    if (Candidate.PhyCopy) {
+      NewCopy = MBB->getParent()->CloneMachineInstr(Candidate.PhyCopy);
+      Pred->insert(InsertPos, NewCopy);
+      // Replace the COPY src with the predecessor's incoming register.
+      Register PredReg = Candidate.PhiRegs[Candidate.Reg][Pred];
+      assert(NewCopy->getOperand(1).getReg() == Candidate.Reg);
+      NewCopy->getOperand(1).setReg(PredReg);
+      // If PredReg is an immediate, try to fold it.
+      int64_t ImmVal;
+      ImmDef = MRI->getVRegDef(PredReg);
+      if (TII->getConstValDefinedInReg(*ImmDef, PredReg, ImmVal))
+        TII->FoldImmediate(*NewCopy, *ImmDef, PredReg, MRI);
+    }
+
+    MachineInstr *NewMI = MBB->getParent()->CloneMachineInstr(UserMI);
+    Pred->insert(InsertPos, NewMI);
+
+    // Replace the PHI registers with the predecessor's incoming registers.
+    // FoldImmediate may commute operands, so we don't have a good way to
+    // visit each operand exactly once. Repeat checking the operands until
+    // there is no change.
+    bool Changed = true;
+    while (Changed) {
+      Changed = false;
+      for (MachineOperand &MO : NewMI->operands()) {
+        if (!MO.isReg() || MO.isDef())
+          continue;
+        Register PredReg, Reg = MO.getReg();
+        if (Reg == Candidate.PhyReg) {
+          PredReg = Reg;
+          ImmDef = NewCopy;
+        } else {
+          if (!Candidate.PhiRegs.count(Reg))
+            continue;
+          PredReg = Candidate.PhiRegs[Reg][Pred];
+          MO.setReg(PredReg);
+          ImmDef = MRI->getVRegDef(PredReg);
+        }
+        // If PredReg is an immediate, try to fold it.
+        int64_t ImmVal;
+        if (TII->getConstValDefinedInReg(*ImmDef, PredReg, ImmVal)) {
+          if (TII->FoldImmediate(*NewMI, *ImmDef, PredReg, MRI)) {
+            Changed = true;
+            // A physical copy instruction can't easily be deleted by
+            // FoldImmediate. But we know it's dead after the folding because
+            // FindSinglePhyRegUser has checked that UserMI is its only user.
+            // So we need to delete it manually now.
+            if (Candidate.PhyCopy)
+              NewCopy->eraseFromParent();
+          }
+        }
+      }
+    }
+
+    // Create a new dst register for the duplicated UserMI and add it to the
+    // new PHI.
+    Register NewVR = MRI->createVirtualRegister(RegRC);
+    NewMI->getOperand(0).setReg(NewVR);
+    MIB.addReg(NewVR);
+    MIB.addMBB(Pred);
+  }
+
+  LLVM_DEBUG(dbgs() << "New PHI: " << *MIB);
+
+  if (Candidate.PhyCopy)
+    Candidate.PhyCopy->eraseFromParent();
+  UserMI->eraseFromParent();
+
+  // Delete the PHI instructions that are used only by UserMI, and the
+  // incoming registers' defining instructions if possible.
+  // Don't do this before the duplication; we may have
+  //
+  //   %6:gr64 = MOV64ri 0
+  // bb.2:
+  //   %2:gr64 = PHI %6:gr64, %bb.0, %1:gr64, %bb.1
+  //   %3:gr64 = PHI %6:gr64, %bb.0, %0:gr64, %bb.1
+  //   %13:gr64 = OR64rr %3:gr64(tied-def 0), %2:gr64, implicit-def dead $eflags
+  //
+  // Both %3 and %2 use %6. After duplicating the OR instruction and replacing
+  // %3 with %6, the OR becomes the single user of %6, and FoldImmediate
+  // deletes the MOV instruction. Later, when %2 is replaced by %6, we can't
+  // find the defining instruction of %6 anymore.
+  for (auto P : Candidate.PhiRegs) {
+    Register PhiReg = P.first;
+    if (!MRI->use_nodbg_empty(PhiReg))
+      continue;
+    MachineInstr *Phi = MRI->getVRegDef(PhiReg);
+    Phi->eraseFromParent();
+    Worklist.remove(Phi);
+    HelperList.remove(Phi);
+    for (auto RP : P.second) {
+      auto InReg = RP.second;
+      if (!MRI->use_nodbg_empty(InReg))
+        continue;
+      MachineInstr *Def = MRI->getVRegDef(InReg);
+      if (Def)
+        Def->eraseFromParent();
+    }
+  }
+
+  return MIB;
+}
+
+// Try to duplicate the single user of Phi into its predecessors.
+// Most of Phi's incoming registers are constants.
+MachineInstr *DupConstPhiUsers::tryToDupPhiUser(MachineInstr *Phi) {
+  CandidateInfo Candidate(Phi);
+  if (!FindCandidateInfo(Candidate))
+    return nullptr;
+
+  // Check if UserMI has other dependent instructions, and if they can be
+  // moved to the dominator.
+  if (!isMISafeToMove(Candidate.UserMI, Candidate))
+    return nullptr;
+
+  CollectPhiRegs(Candidate);
+
+  if (!FindInsertPos(Candidate))
+    return nullptr;
+
+  // Move the dependent instructions to the immediate dominator.
+  MoveDepMIs(Candidate);
+
+  // Duplicate UserMI into the predecessors.
+  return DuplicatePhiUser(Candidate);
+}
+
+bool DupConstPhiUsers::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MDT = &getAnalysis<MachineDominatorTree>();
+  MRI = &MF.getRegInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+    Worklist = CollectConstPhis(MBB);
+
+    // The transformation of one PHI can make another PHI eligible for the
+    // optimization, so we iterate on the PHI list until no more changes are
+    // made. Theoretically this may have a high cost, but we don't expect it
+    // to trigger many times in a single MBB in practice. We also limit it to
+    // an arbitrarily chosen small number of rounds for pathological cases.
+    int Rounds = 0;
+    bool LocalChanged = true;
+    while (LocalChanged && Rounds < MAX_ROUNDS) {
+      LocalChanged = false;
+      HelperList.clear();
+
+      while (!Worklist.empty()) {
+        MachineInstr *Phi = Worklist.pop_back_val();
+        MachineInstr *NewPhi = tryToDupPhiUser(Phi);
+        if (NewPhi) {
+          HelperList.insert(NewPhi);
+          LocalChanged = true;
+        } else
+          HelperList.insert(Phi);
+      }
+
+      Changed |= LocalChanged;
+      Worklist = HelperList;
+      Rounds++;
+    }
+  }
+
+  return Changed;
+}
Index: llvm/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -204,7 +204,8 @@
     bool isMoveImmediate(MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
                          DenseMap<Register, MachineInstr *> &ImmDefMIs);
     bool foldImmediate(MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
-                       DenseMap<Register, MachineInstr *> &ImmDefMIs);
+                       DenseMap<Register, MachineInstr *> &ImmDefMIs,
+                       bool &Deleted);
 
     /// Finds recurrence cycles, but only ones that formulated around
     /// a def operand and a use operand that are tied. If there is a use
@@ -219,7 +220,8 @@
     /// copy, replace the uses of this copy with the previously seen copy's
    /// destination register.
     bool foldRedundantCopy(MachineInstr &MI,
-                           DenseMap<RegSubRegPair, MachineInstr *> &CopyMIs);
+                           DenseMap<RegSubRegPair, MachineInstr *> &CopyMIs,
+                           SmallPtrSetImpl<MachineInstr *> &LocalMIs);
 
     /// Is the register \p Reg a non-allocatable physical register?
     bool isNAPhysCopy(Register Reg);
@@ -1372,7 +1374,8 @@
 /// and only if the def and use are in the same BB.
 bool PeepholeOptimizer::foldImmediate(
     MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
-    DenseMap<Register, MachineInstr *> &ImmDefMIs) {
+    DenseMap<Register, MachineInstr *> &ImmDefMIs, bool &Deleted) {
+  Deleted = false;
   for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg() || MO.isDef())
       continue;
@@ -1386,6 +1389,17 @@
     assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
     if (TII->FoldImmediate(MI, *II->second, Reg, MRI)) {
       ++NumImmFold;
+      // If ImmDefMI is not deleted, try to see if MI can be deleted.
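+      // For example (illustrative): folding the immediate into the COPY in
+      //   %2:gr32 = MOV32ri 7
+      //   %3:gr32 = COPY %2:gr32
+      // turns the COPY into "%3:gr32 = MOV32ri 7", which is identical to the
+      // definition of %2 except for the def register, so %3 can be replaced
+      // by %2 and MI erased.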
+      if (MRI->getVRegDef(Reg) &&
+          MI.isIdenticalTo(*II->second, MachineInstr::IgnoreVRegDefs)) {
+        Register DstReg = MI.getOperand(0).getReg();
+        if (DstReg.isVirtual() &&
+            MRI->getRegClass(DstReg) == MRI->getRegClass(Reg)) {
+          MRI->replaceRegWith(DstReg, Reg);
+          MI.eraseFromParent();
+          Deleted = true;
+        }
+      }
       return true;
     }
   }
@@ -1407,7 +1421,8 @@
 //
 // Should replace %2 uses with %1:sub1
 bool PeepholeOptimizer::foldRedundantCopy(
-    MachineInstr &MI, DenseMap<RegSubRegPair, MachineInstr *> &CopyMIs) {
+    MachineInstr &MI, DenseMap<RegSubRegPair, MachineInstr *> &CopyMIs,
+    SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
   assert(MI.isCopy() && "expected a COPY machine instruction");
 
   Register SrcReg = MI.getOperand(1).getReg();
@@ -1427,6 +1442,8 @@
   }
 
   MachineInstr *PrevCopy = CopyMIs.find(SrcPair)->second;
+  if (!LocalMIs.count(PrevCopy))
+    return false;
 
   assert(SrcSubReg == PrevCopy->getOperand(1).getSubReg() &&
          "Unexpected mismatching subreg!");
@@ -1734,7 +1751,7 @@
         continue;
       }
 
-      if (MI->isCopy() && (foldRedundantCopy(*MI, CopySrcMIs) ||
+      if (MI->isCopy() && (foldRedundantCopy(*MI, CopySrcMIs, LocalMIs) ||
                            foldRedundantNAPhysCopy(*MI, NAPhysToVirtMIs))) {
         LocalMIs.erase(MI);
         LLVM_DEBUG(dbgs() << "Deleting redundant copy: " << *MI << "\n");
@@ -1752,8 +1769,14 @@
       // next iteration sees the new instructions.
       MII = MI;
       ++MII;
-      if (SeenMoveImm)
-        Changed |= foldImmediate(*MI, ImmDefRegs, ImmDefMIs);
+      if (SeenMoveImm) {
+        bool Deleted;
+        Changed |= foldImmediate(*MI, ImmDefRegs, ImmDefMIs, Deleted);
+        if (Deleted) {
+          LocalMIs.erase(MI);
+          continue;
+        }
+      }
     }
 
     // Check whether MI is a load candidate for folding into a later
Index: llvm/lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- llvm/lib/CodeGen/TargetPassConfig.cpp
+++ llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1275,6 +1275,8 @@
 
 /// Add passes that optimize machine instructions in SSA form.
 void TargetPassConfig::addMachineSSAOptimization() {
+  addPass(&DupConstPhiUsersID);
+
   // Pre-ra tail duplication.
   addPass(&EarlyTailDuplicateID);
 
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -531,6 +531,20 @@
                                  Register &FoldAsLoadDefReg,
                                  MachineInstr *&DefMI) const override;
 
+  bool FoldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI, Register Reg,
+                         int64_t ImmVal, MachineRegisterInfo *MRI,
+                         bool MakeChange) const;
+
+  /// FoldImmediate - 'Reg' is known to be defined by a move immediate
+  /// instruction, try to fold the immediate into the use instruction.
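+  /// Returns true if the folding succeeded. Note that the defining
+  /// instruction may be erased when the folded value has no remaining uses.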
+  bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
+                     MachineRegisterInfo *MRI) const override;
+
+  /// Check if FoldImmediate can fold ImmVal into the given instruction.
+  /// This function must be synchronized with FoldImmediate.
+  bool canFoldImmediate(MachineInstr &UseMI, Register UseReg, int64_t ImmVal,
+                        MachineRegisterInfo *MRI) const override;
+
   std::pair<unsigned, unsigned>
   decomposeMachineOperandsTargetFlags(unsigned TF) const override;
 
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3848,12 +3848,36 @@
 bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
                                            const Register Reg,
                                            int64_t &ImmVal) const {
-  if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
+  Register MovReg = Reg;
+  const MachineInstr *MovMI = &MI;
+  if (MI.isSubregToReg()) {
+    // We use the following pattern to set up a 64-bit immediate:
+    //   %8:gr32 = MOV32r0 implicit-def dead $eflags
+    //   %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
+    unsigned FillBits = MI.getOperand(1).getImm();
+    unsigned SubIdx = MI.getOperand(3).getImm();
+    MovReg = MI.getOperand(2).getReg();
+    if (SubIdx != X86::sub_32bit || FillBits != 0)
+      return false;
+    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+    MovMI = MRI.getVRegDef(MovReg);
+  }
+
+  if (MovMI->getOpcode() == X86::MOV32r0 &&
+      MovMI->getOperand(0).getReg() == MovReg) {
+    ImmVal = 0;
+    return true;
+  }
+
+  if (MovMI->getOpcode() != X86::MOV32ri &&
+      MovMI->getOpcode() != X86::MOV64ri &&
+      MovMI->getOpcode() != X86::MOV32ri64 &&
+      MovMI->getOpcode() != X86::MOV8ri)
     return false;
   // Mov Src can be a global address.
-  if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
+  if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
     return false;
-  ImmVal = MI.getOperand(1).getImm();
+  ImmVal = MovMI->getOperand(1).getImm();
   return true;
 }
 
@@ -4720,6 +4744,230 @@
   return nullptr;
 }
 
+/// Conservatively check if there is a live EFLAGS value reaching MI.
+static bool LiveEFLAGS(MachineInstr *MI, const TargetRegisterInfo *TRI) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock::iterator I(MI);
+  while (true) {
+    if (I == MBB->begin())
+      break;
+    --I;
+    if (I->registerDefIsDead(X86::EFLAGS, TRI))
+      return false;
+    if (I->readsRegister(X86::EFLAGS, TRI))
+      return true;
+    if (I->modifiesRegister(X86::EFLAGS, TRI))
+      return true;
+  }
+  return MBB->isLiveIn(X86::EFLAGS);
+}
+
+/// The real implementation of FoldImmediate and canFoldImmediate.
+bool X86InstrInfo::FoldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
+                                     Register Reg, int64_t ImmVal,
+                                     MachineRegisterInfo *MRI,
+                                     bool MakeChange) const {
+  bool Modified = false;
+  bool ShiftRotate = false;
+  // When ImmVal is 0, some instructions can be changed to a COPY.
+  bool CanChangeToCopy = false;
+
+  if (!isInt<32>(ImmVal))
+    return false;
+
+  unsigned Opc = UseMI.getOpcode();
+  unsigned NewOpc;
+  switch (Opc) {
+  case TargetOpcode::COPY: {
+    Register ToReg = UseMI.getOperand(0).getReg();
+    const TargetRegisterClass *RC = nullptr;
+    if (ToReg.isVirtual())
+      RC = MRI->getRegClass(ToReg);
+    bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
+                   (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
+    bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
+                   (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
+    bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
+                  (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
+
+    if (ImmVal == 0) {
+      // We have MOV32r0 only.
+      if (!GR32Reg)
+        return false;
+    }
+
+    if (GR64Reg)
+      NewOpc = X86::MOV64ri;
+    else if (GR32Reg) {
+      NewOpc = X86::MOV32ri;
+      if (ImmVal == 0) {
+        // MOV32r0 is different from the other cases because it doesn't encode
+        // the immediate in the instruction, so we directly modify it here.
+        if (!MakeChange)
+          return true;
+
+        // MOV32r0 clobbers EFLAGS.
+        if (LiveEFLAGS(&UseMI, MRI->getTargetRegisterInfo()))
+          return false;
+        UseMI.setDesc(get(X86::MOV32r0));
+        UseMI.RemoveOperand(UseMI.findRegisterUseOperandIdx(Reg));
+        UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
+                                                   /*isImp=*/true,
+                                                   /*isKill=*/false,
+                                                   /*isDead=*/true));
+        Modified = true;
+      }
+    } else if (GR8Reg)
+      NewOpc = X86::MOV8ri;
+    else
+      return false;
+    break;
+  }
+
+  case X86::ADD64rr:
+    NewOpc = X86::ADD64ri32;
+    CanChangeToCopy = true;
+    break;
+  case X86::SUB64rr:
+    NewOpc = X86::SUB64ri32;
+    CanChangeToCopy = true;
+    if (UseMI.findRegisterUseOperandIdx(Reg) != 2)
+      return false;
+    break;
+  case X86::AND64rr:
+    NewOpc = X86::AND64ri32;
+    break;
+  case X86::OR64rr:
+    NewOpc = X86::OR64ri32;
+    CanChangeToCopy = true;
+    break;
+  case X86::XOR64rr:
+    NewOpc = X86::XOR64ri32;
+    CanChangeToCopy = true;
+    break;
+  case X86::SHR64rCL:
+    NewOpc = X86::SHR64ri;
+    ShiftRotate = true;
+    break;
+  case X86::SHL64rCL:
+    NewOpc = X86::SHL64ri;
+    ShiftRotate = true;
+    break;
+  case X86::SAR64rCL:
+    NewOpc = X86::SAR64ri;
+    ShiftRotate = true;
+    break;
+
+  default:
+    switch (Opc) {
+    case X86::ADD32rr:
+      NewOpc = X86::ADD32ri;
+      CanChangeToCopy = true;
+      break;
+    case X86::SUB32rr:
+      NewOpc = X86::SUB32ri;
+      CanChangeToCopy = true;
+      if (UseMI.findRegisterUseOperandIdx(Reg) != 2)
+        return false;
+      break;
+    case X86::AND32rr:
+      NewOpc = X86::AND32ri;
+      break;
+    case X86::OR32rr:
+      NewOpc = X86::OR32ri;
+      CanChangeToCopy = true;
+      break;
+    case X86::XOR32rr:
+      NewOpc = X86::XOR32ri;
+      CanChangeToCopy = true;
+      break;
+    case X86::SHR32rCL:
+      NewOpc = X86::SHR32ri;
+      ShiftRotate = true;
+      break;
+    case X86::SHL32rCL:
+      NewOpc = X86::SHL32ri;
+      ShiftRotate = true;
+      break;
+    case X86::SAR32rCL:
+      NewOpc = X86::SAR32ri;
+      ShiftRotate = true;
+      break;
+    default:
+      return false;
+    }
+  }
+
+  if (ShiftRotate) {
+    unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg);
+    if (RegIdx < 2)
+      return false;
+    if (!isInt<8>(ImmVal))
+      return false;
+    assert(Reg == X86::CL);
+
+    if (!MakeChange)
+      return true;
+    UseMI.setDesc(get(NewOpc));
+    UseMI.RemoveOperand(RegIdx);
+    UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
+    // Reg is the physical register $cl, so we can't tell through MRI whether
+    // DefMI is dead. Let the caller handle it, or the dead-mi-elimination
+    // pass can delete the dead physical register defining instruction.
+    return true;
+  }
+
+  if (!MakeChange)
+    return true;
+
+  if (!Modified) {
+    // Modify the instruction.
+    if (ImmVal == 0 && CanChangeToCopy &&
+        UseMI.registerDefIsDead(X86::EFLAGS)) {
+      //   %100 = add %101, 0
+      // ==>
+      //   %100 = COPY %101
+      UseMI.setDesc(get(TargetOpcode::COPY));
+      UseMI.RemoveOperand(UseMI.findRegisterUseOperandIdx(Reg));
+      UseMI.RemoveOperand(UseMI.findRegisterDefOperandIdx(X86::EFLAGS));
+      UseMI.untieRegOperand(0);
+      UseMI.clearFlag(MachineInstr::MIFlag::NoSWrap);
+      UseMI.clearFlag(MachineInstr::MIFlag::NoUWrap);
+    } else {
+      unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+      if (findCommutedOpIndices(UseMI, Op1, Op2) &&
+          UseMI.getOperand(1).getReg() == Reg)
+        commuteInstruction(UseMI);
+      UseMI.setDesc(get(NewOpc));
+      UseMI.findRegisterUseOperand(Reg)->ChangeToImmediate(ImmVal);
+    }
+  }
+
+  if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
+    DefMI->eraseFromBundle();
+
+  return true;
+}
+
+/// FoldImmediate - 'Reg' is known to be defined by a move immediate
+/// instruction, try to fold the immediate into the use instruction.
+bool X86InstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                 Register Reg,
+                                 MachineRegisterInfo *MRI) const {
+  int64_t ImmVal;
+  if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
+    return false;
+
+  return FoldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
+}
+
+/// Check if FoldImmediate can fold ImmVal into the given instruction.
+/// This function must be synchronized with FoldImmediate.
+bool X86InstrInfo::canFoldImmediate(MachineInstr &UseMI, Register UseReg,
+                                    int64_t ImmVal,
+                                    MachineRegisterInfo *MRI) const {
+  return FoldImmediateImpl(UseMI, nullptr, UseReg, ImmVal, MRI, false);
+}
+
 /// Expand a single-def pseudo instruction to a two-addr
 /// instruction with two undef reads of the register being defined.
 /// This is used for mapping:
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -102,6 +102,8 @@
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Duplicate PHI Users
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -281,6 +281,8 @@
 ; GCN-O1-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-NEXT: SI Lower i1 Copies
 ; GCN-O1-NEXT: Finalize ISel and expand pseudo-instructions
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Duplicate PHI Users
 ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O1-NEXT: Early Tail Duplication
 ; GCN-O1-NEXT: Optimize machine instruction PHIs
@@ -558,6 +560,8 @@
 ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI Lower i1 Copies
 ; GCN-O1-OPTS-NEXT: Finalize ISel and expand pseudo-instructions
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Duplicate PHI Users
 ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT: Early Tail Duplication
 ; GCN-O1-OPTS-NEXT: Optimize machine instruction PHIs
@@ -844,6 +848,8 @@
 ; GCN-O2-NEXT: MachinePostDominator Tree Construction
 ; GCN-O2-NEXT: SI Lower i1 Copies
 ; GCN-O2-NEXT: Finalize ISel and expand pseudo-instructions
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Duplicate PHI Users
 ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O2-NEXT: Early Tail Duplication
 ; GCN-O2-NEXT: Optimize machine instruction PHIs
@@ -1144,6 +1150,8 @@
 ; GCN-O3-NEXT: MachinePostDominator Tree Construction
 ; GCN-O3-NEXT: SI Lower i1 Copies
 ; GCN-O3-NEXT: Finalize ISel and expand pseudo-instructions
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Duplicate PHI Users
 ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
 ; GCN-O3-NEXT: Early Tail Duplication
 ; GCN-O3-NEXT: Optimize machine instruction PHIs
Index: llvm/test/CodeGen/ARM/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -75,6 +75,8 @@
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: ARM Instruction Selection
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Duplicate PHI Users
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
Index: llvm/test/CodeGen/PowerPC/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -91,6 +91,8 @@
 ; CHECK-NEXT: PowerPC CTR Loops Verify
 ; CHECK-NEXT: PowerPC VSX Copy Legalization
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Duplicate PHI Users
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
Index: llvm/test/CodeGen/X86/dup-phi-users1.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/dup-phi-users1.ll
@@ -0,0 +1,358 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Base test case.
+define zeroext i1 @test1(i32 %s, i32* %ptr) {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $1025, %edi # imm = 0x401
+; CHECK-NEXT: jae .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: jmp .LBB0_4
+; CHECK-NEXT: .LBB0_2: # %if.else
+; CHECK-NEXT: cmpl $4096, %edi # imm = 0x1000
+; CHECK-NEXT: ja .LBB0_6
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: .LBB0_4: # %return.sink.split
+; CHECK-NEXT: movl %edi, (%rsi)
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_6:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+entry:
+  %cmp = icmp ult i32 %s, 1025
+  br i1 %cmp, label %return.sink.split, label %if.else
+
+if.else:
+  %cmp2 = icmp ult i32 %s, 4097
+  br i1 %cmp2, label %return.sink.split, label %return
+
+return.sink.split:
+  %.sink = phi i32 [ 3, %entry ], [ 7, %if.else ]
+  %and0 = and i32 %s, %.sink
+  store i32 %and0, i32* %ptr, align 4
+  br label %return
+
+return:
+  %retval = phi i1 [ false, %if.else ], [ true, %return.sink.split ]
+  ret i1 %retval
+}
+
+; The PHI user XOR has a dependent instruction TRUNC, which can also be moved
+; to the dominator.
+define zeroext i1 @test2(i64 %s, i32* %ptr) {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpq $1025, %rdi # imm = 0x401
+; CHECK-NEXT: jae .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: xorl $3, %edi
+; CHECK-NEXT: jmp .LBB1_4
+; CHECK-NEXT: .LBB1_2: # %if.else
+; CHECK-NEXT: cmpq $4096, %rdi # imm = 0x1000
+; CHECK-NEXT: ja .LBB1_6
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: xorl $7, %edi
+; CHECK-NEXT: .LBB1_4: # %return.sink.split
+; CHECK-NEXT: movl %edi, (%rsi)
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_6:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+entry:
+  %cmp = icmp ult i64 %s, 1025
+  br i1 %cmp, label %return.sink.split, label %if.else
+
+if.else:
+  %cmp2 = icmp ult i64 %s, 4097
+  br i1 %cmp2, label %return.sink.split, label %return
+
+return.sink.split:
+  %.sink = phi i32 [ 3, %entry ], [ 7, %if.else ]
+  %t = trunc i64 %s to i32
+  %xor0 = xor i32 %t, %.sink
+  store i32 %xor0, i32* %ptr, align 4
+  br label %return
+
+return:
+  %retval = phi i1 [ false, %if.else ], [ true, %return.sink.split ]
+  ret i1 %retval
+}
+
+; SHL has a physical register operand, which is set up by a previous COPY
+; instruction. Both instructions should be duplicated together.
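+; The shift amount is passed in $cl, so the COPY that sets up $cl has to be
+; duplicated next to the SHL.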
+define zeroext i1 @test3(i32 %s, i32 %shm, i32* %ptr) {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $1025, %edi # imm = 0x401
+; CHECK-NEXT: jae .LBB2_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: shll $3, %edi
+; CHECK-NEXT: jmp .LBB2_3
+; CHECK-NEXT: .LBB2_2: # %if.else
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shll %cl, %edi
+; CHECK-NEXT: .LBB2_3: # %return.sink.split
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: retq
+entry:
+  %cmp = icmp ult i32 %s, 1025
+  br i1 %cmp, label %return.sink.split, label %if.else
+
+if.else:
+  br label %return.sink.split
+
+return.sink.split:
+  %.sink = phi i32 [ 3, %entry ], [ %shm, %if.else ]
+  %shl0 = shl i32 %s, %.sink
+  store i32 %shl0, i32* %ptr, align 4
+  ret i1 true
+}
+
+; Two PHI instructions; one PHI user depends on the other PHI user. Both
+; instructions can be duplicated.
+define zeroext i1 @test4(i32 %s, i32* %ptr) {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $1025, %edi # imm = 0x401
+; CHECK-NEXT: jae .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addl $7, %edi
+; CHECK-NEXT: shrl $3, %edi
+; CHECK-NEXT: jmp .LBB3_4
+; CHECK-NEXT: .LBB3_2: # %if.else
+; CHECK-NEXT: cmpl $4096, %edi # imm = 0x1000
+; CHECK-NEXT: ja .LBB3_6
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addl $100, %edi
+; CHECK-NEXT: shrl $7, %edi
+; CHECK-NEXT: .LBB3_4: # %return.sink.split
+; CHECK-NEXT: movl %edi, (%rsi)
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB3_6:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+entry:
+  %cmp = icmp ult i32 %s, 1025
+  br i1 %cmp, label %return.sink.split, label %if.else
+
+if.else:
+  %cmp2 = icmp ult i32 %s, 4097
+  br i1 %cmp2, label %return.sink.split, label %return
+
+return.sink.split:
+  %.sink5 = phi i32 [ 7, %entry ], [ 100, %if.else ]
+  %.sink = phi i32 [ 3, %entry ], [ 7, %if.else ]
+  %add6 = add nuw nsw i32 %.sink5, %s
+  %shr7 = lshr i32 %add6, %.sink
+  store i32 %shr7, i32* %ptr, align 4
+  br label %return
+
+return:
+  %retval = phi i1 [ false, %if.else ], [ true, %return.sink.split ]
+  ret i1 %retval
+}
+
+; Two dependent PHI users, with movable dependent instructions and physical
+; register usage.
+define zeroext i1 @test5(i64 %s, i32* %ptr) {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpq $1025, %rdi # imm = 0x401
+; CHECK-NEXT: jae .LBB4_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: leal 7(%rdi), %eax
+; CHECK-NEXT: sarl $3, %eax
+; CHECK-NEXT: jmp .LBB4_4
+; CHECK-NEXT: .LBB4_2: # %if.else
+; CHECK-NEXT: cmpq $4096, %rdi # imm = 0x1000
+; CHECK-NEXT: ja .LBB4_6
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: leal 100(%rdi), %eax
+; CHECK-NEXT: sarl $7, %eax
+; CHECK-NEXT: .LBB4_4: # %return.sink.split
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB4_6:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+entry:
+  %cmp = icmp ult i64 %s, 1025
+  br i1 %cmp, label %return.sink.split, label %if.else
+
+if.else:
+  %cmp2 = icmp ult i64 %s, 4097
+  br i1 %cmp2, label %return.sink.split, label %return
+
+return.sink.split:
+  %.sink5 = phi i32 [ 7, %entry ], [ 100, %if.else ]
+  %.sink = phi i32 [ 3, %entry ], [ 7, %if.else ]
+  %conv = trunc i64 %s to i32
+  %add6 = add nuw nsw i32 %.sink5, %conv
+  %shr7 = ashr i32 %add6, %.sink
+  store i32 %shr7, i32* %ptr, align 4
+  br label %return
+
+return:
+  %retval = phi i1 [ false, %if.else ], [ true, %return.sink.split ]
+  ret i1 %retval
+}
+
+; One user instruction with two PHI operands.
+define i64 @test6(i1 %cond, i64 ()* %p) {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
+; CHECK-NEXT: movq %rsi, %r14
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB5_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: movl $5, %ebx
+; CHECK-NEXT: orq $4, %rbx
+; CHECK-NEXT: jmp .LBB5_3
+; CHECK-NEXT: .LBB5_2: # %if.end
+; CHECK-NEXT: callq *%r14
+; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movl %eax, %ebx
+; CHECK-NEXT: orq %rcx, %rbx
+; CHECK-NEXT: .LBB5_3: # %return
+; CHECK-NEXT: callq *%r14
+; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+  br i1 %cond, label %return, label %if.end
+
+if.end:
+  %call = tail call i64 %p()
+  %high = and i64 %call, -4294967296
+  %low = and i64 %call, 4294967295
+  br label %return
+
+return:
+  %p1 = phi i64 [ %low, %if.end ], [ 5, %entry ]
+  %p2 = phi i64 [ %high, %if.end ], [ 4, %entry ]
+  %v = or i64 %p2, %p1
+  %call2 = tail call i64 %p()
+  ret i64 %v
+}
+
+define i64 @test7(i1 %cond, i8* %Ptr, i64 %Val) {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne .LBB6_2
+; CHECK-NEXT: # %bb.1: # %two
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: .LBB6_2: # %end
+; CHECK-NEXT: retq
+entry:
+  %t41 = ptrtoint i8* %Ptr to i64
+  %t42 = zext i64 %t41 to i128
+  br i1 %cond, label %end, label %two
+
+two:
+  %t36 = zext i64 %Val to i128
+  %t37 = shl i128 %t36, 64
+  %ins39 = or i128 %t42, %t37
+  br label %end
+
+end:
+  %t869.0 = phi i128 [ %t42, %entry ], [ %ins39, %two ]
+  %t32 = trunc i128 %t869.0 to i64
+  %t29 = lshr i128 %t869.0, 64
+  %t30 = trunc i128 %t29 to i64
+
+  %t2 = add i64 %t32, %t30
+  ret i64 %t2
+}
+
+define i32 @test8(i1 %c1, i32 %v1, i32 %v2, i32 %v3) {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB7_2
+; CHECK-NEXT: # %bb.1: # %cond.true
+; CHECK-NEXT: incl %esi
+; CHECK-NEXT: xorl %edx, %esi
+; CHECK-NEXT: incl %esi
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: .LBB7_2: # %cond.end
+; CHECK-NEXT: retq
+entry:
+  br i1 %c1, label %cond.true, label %cond.end
+
+cond.true:
+  %add = add nsw i32 1, %v1
+  %xor = xor i32 %add, %v2
+  %add1 = add nsw i32 1, %xor
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %add1, %cond.true ], [ 0, %entry ]
+  %sub = sub nsw i32 %v3, %cond
+  ret i32 %sub
+}
+
+define i32 @test9(i8 %x, i1 %cond) {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: testb $1, %sil
+; CHECK-NEXT: je .LBB8_1
+; CHECK-NEXT: # %bb.2: # %if
+; CHECK-NEXT: xorl $33, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB8_1:
+; CHECK-NEXT: xorl $21, %eax
+; CHECK-NEXT: retq
+entry:
+  br i1 %cond, label %if, label %endif
+if:
+  br label %endif
+endif:
+  %phi = phi i32 [ 21, %entry ], [ 33, %if ]
+  %zext = zext i8 %x to i32
+  %logic = xor i32 %zext, %phi
+  ret i32 %logic
+}
Index: llvm/test/CodeGen/X86/dup-phi-users2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/dup-phi-users2.ll
@@ -0,0 +1,627 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i32 @test_basic(i1 %flag, i32 %arg) {
+; CHECK-LABEL: test_basic:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addl $7, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_2: # %b
+; CHECK-NEXT: addl $11, %eax
+; CHECK-NEXT: retq
+entry:
+  br i1 %flag, label %a, label %b
+
+a:
+  br label %exit
+
+b:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 7, %a ], [ 11, %b ]
+  %sum = add i32 %arg, %p
+  ret i32 %sum
+}
+
+; Check that we handle commuted operands and get the constant onto the RHS.
+define i32 @test_commuted(i1 %flag, i32 %arg) {
+; CHECK-LABEL: test_commuted:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB1_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addl $7, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_2: # %b
+; CHECK-NEXT: addl $11, %eax
+; CHECK-NEXT: retq
+entry:
+  br i1 %flag, label %a, label %b
+
+a:
+  br label %exit
+
+b:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 7, %a ], [ 11, %b ]
+  %sum = add i32 %p, %arg
+  ret i32 %sum
+}
+
+; We don't split critical edges. But we can still replace the MOV immediate
+; instruction with an ALU instruction without any extra cost.
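+; Here (entry, exit) is a critical edge: the ADD duplicated into %entry
+; executes on both paths, but it replaces the MOV immediate that used to set
+; up the PHI input, so the edge does not get more expensive.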
+define i32 @test_simple_crit_edge(i1 %flag, i32 %arg) {
+; CHECK-LABEL: test_simple_crit_edge:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB2_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addl $7, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB2_2: # %a
+; CHECK-NEXT: addl $11, %eax
+; CHECK-NEXT: retq
+entry:
+  br i1 %flag, label %exit, label %a
+
+a:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 7, %entry ], [ 11, %a ]
+  %sum = add i32 %arg, %p
+  ret i32 %sum
+}
+
+define i32 @test_no_spec_dominating_inst(i1 %flag, i32* %ptr) {
+; CHECK-LABEL: test_no_spec_dominating_inst:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl (%rsi), %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: je .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addl $7, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB3_2: # %b
+; CHECK-NEXT: addl $11, %eax
+; CHECK-NEXT: retq
+entry:
+  %load = load i32, i32* %ptr
+  br i1 %flag, label %a, label %b
+
+a:
+  br label %exit
+
+b:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 7, %a ], [ 11, %b ]
+  %sum = add i32 %load, %p
+  ret i32 %sum
+}
+
+; We have special logic for handling PHI nodes; make sure it doesn't get
+; confused by a dominating PHI.
+define i32 @test_no_spec_dominating_phi(i1 %flag1, i1 %flag2, i32 %x, i32 %y) {
+; CHECK-LABEL: test_no_spec_dominating_phi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne .LBB4_2
+; CHECK-NEXT: # %bb.1: # %y.block
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: .LBB4_2: # %merge
+; CHECK-NEXT: testb $1, %sil
+; CHECK-NEXT: je .LBB4_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addl $7, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB4_4: # %b
+; CHECK-NEXT: addl $11, %eax
+; CHECK-NEXT: retq
+entry:
+  br i1 %flag1, label %x.block, label %y.block
+
+x.block:
+  br label %merge
+
+y.block:
+  br label %merge
+
+merge:
+  %xy.phi = phi i32 [ %x, %x.block ], [ %y, %y.block ]
+  br i1 %flag2, label %a, label %b
+
+a:
+  br label %exit
+
+b:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 7, %a ], [ 11, %b ]
+  %sum = add i32 %xy.phi, %p
+  ret i32 %sum
+}
+
+; Ensure that we will speculate some number of "free" instructions on the
+; given architecture even though they are unrelated to the PHI itself.
+define i32 @test_speculate_free_insts(i1 %flag, i64 %arg) { +; CHECK-LABEL: test_speculate_free_insts: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addl $7, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB5_2: # %b +; CHECK-NEXT: addl $11, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq +entry: + br i1 %flag, label %a, label %b + +a: + br label %exit + +b: + br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %t1 = trunc i64 %arg to i48 + %t2 = trunc i48 %t1 to i32 + %sum = add i32 %t2, %p + ret i32 %sum +} + +define i32 @test_speculate_free_phis(i1 %flag, i32 %arg1, i32 %arg2) { +; CHECK-LABEL: test_speculate_free_phis: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: addl $7, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB6_2: # %b +; CHECK-NEXT: addl $11, %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: retq +entry: + br i1 %flag, label %a, label %b + +a: + br label %exit + +b: + br label %exit + +exit: + %p1 = phi i32 [ 7, %a ], [ 11, %b ] + %p2 = phi i32 [ %arg1, %a ], [ %arg2, %b ] + %sum = add i32 %p2, %p1 + ret i32 %sum +} + +; We shouldn't speculate multiple uses even if each individually looks +; profitable because of the total cost. +define i32 @test_no_spec_multi_uses(i1 %flag, i32 %arg1, i32 %arg2, i32 %arg3) { +; CHECK-LABEL: test_no_spec_multi_uses: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl $7, %ecx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB7_2 +; CHECK-NEXT: # %bb.1: # %b +; CHECK-NEXT: movl $11, %ecx +; CHECK-NEXT: .LBB7_2: # %exit +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: retq +entry: + br i1 %flag, label %a, label %b + +a: + br label %exit + +b: + br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %add1 = add i32 %arg1, %p + %add2 = add i32 %arg2, %p + %add3 = add i32 %arg3, %p + %sum1 = add i32 %add1, %add2 + %sum2 = add i32 %sum1, %add3 + ret i32 %sum2 +} + +define i32 @test_multi_phis1(i1 %flag, i32 %arg) { +; CHECK-LABEL: test_multi_phis1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB8_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addl $1, %eax +; CHECK-NEXT: addl $3, %eax +; CHECK-NEXT: addl $5, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB8_2: # %b +; CHECK-NEXT: addl $2, %eax +; CHECK-NEXT: addl $4, %eax +; CHECK-NEXT: addl $6, %eax +; CHECK-NEXT: retq +entry: + br i1 %flag, label %a, label %b + +a: + br label %exit + +b: + br label %exit + +exit: + %p1 = phi i32 [ 1, %a ], [ 2, %b ] + %p2 = phi i32 [ 3, %a ], [ 4, %b ] + %p3 = phi i32 [ 5, %a ], [ 6, %b ] + %sum1 = add i32 %arg, %p1 + %sum2 = add i32 %sum1, %p2 + %sum3 = add i32 %sum2, %p3 + ret i32 %sum3 +} + +; Check that the order of the PHIs doesn't impact the behavior. 
+define i32 @test_multi_phis2(i1 %flag, i32 %arg) { +; CHECK-LABEL: test_multi_phis2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB9_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addl $1, %eax +; CHECK-NEXT: addl $3, %eax +; CHECK-NEXT: addl $5, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB9_2: # %b +; CHECK-NEXT: addl $2, %eax +; CHECK-NEXT: addl $4, %eax +; CHECK-NEXT: addl $6, %eax +; CHECK-NEXT: retq +entry: + br i1 %flag, label %a, label %b + +a: + br label %exit + +b: + br label %exit + +exit: + %p3 = phi i32 [ 5, %a ], [ 6, %b ] + %p2 = phi i32 [ 3, %a ], [ 4, %b ] + %p1 = phi i32 [ 1, %a ], [ 2, %b ] + %sum1 = add i32 %arg, %p1 + %sum2 = add i32 %sum1, %p2 + %sum3 = add i32 %sum2, %p3 + ret i32 %sum3 +} + +define i32 @test_no_spec_indirectbr(i1 %flag, i32 %arg) { +; CHECK-LABEL: test_no_spec_indirectbr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB10_2 +; CHECK-NEXT: # %bb.1: # %a +; CHECK-NEXT: addl $7, %eax +; CHECK-NEXT: jmpq *%rax +; CHECK-NEXT: .LBB10_2: # %b +; CHECK-NEXT: addl $11, %eax +; CHECK-NEXT: jmpq *%rax +; CHECK-NEXT: .LBB10_3: # %exit +; CHECK-NEXT: retq +entry: + br i1 %flag, label %a, label %b + +a: + indirectbr i8* undef, [label %exit] + +b: + indirectbr i8* undef, [label %exit] + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +} + + +define i32 @test_no_spec_invoke_continue(i1 %flag, i32 %arg) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: test_no_spec_invoke_continue: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB11_3 +; CHECK-NEXT: # %bb.1: # %a +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: callq g@PLT +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: addl $7, %ebx +; CHECK-NEXT: jmp .LBB11_5 +; CHECK-NEXT: .LBB11_3: # %b +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: callq g@PLT +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: addl $11, %ebx +; CHECK-NEXT: .LBB11_5: # %exit +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB11_6: # %lpad +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: callq _Unwind_Resume@PLT +entry: + br i1 %flag, label %a, label %b + +a: + invoke void @g() + to label %exit unwind label %lpad + +b: + invoke void @g() + to label %exit unwind label %lpad + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum + +lpad: + %lp = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } undef +} + +define i32 @test_no_spec_landingpad(i32 %arg, i32* %ptr) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: test_no_spec_landingpad: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: leal 7(%rbx), %ebp +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: callq g@PLT +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: # %bb.1: # %invoke.cont +; CHECK-NEXT: addl $11, %ebx +; CHECK-NEXT: .Ltmp7: 
+; CHECK-NEXT:    movl %ebx, %ebp
+; CHECK-NEXT:    callq g@PLT
+; CHECK-NEXT:  .Ltmp8:
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB12_3: # %lpad
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:  .Ltmp9:
+; CHECK-NEXT:    movl %ebp, (%r14)
+; CHECK-NEXT:    callq _Unwind_Resume@PLT
+entry:
+  invoke void @g()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  invoke void @g()
+          to label %exit unwind label %lpad
+
+lpad:
+  %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ]
+  %lp = landingpad { i8*, i32 }
+          cleanup
+  %sum = add i32 %arg, %p
+  store i32 %sum, i32* %ptr
+  resume { i8*, i32 } undef
+
+exit:
+  ret i32 0
+}
+
+
+define i32 @test_no_spec_cleanuppad(i32 %arg, i32* %ptr) personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: test_no_spec_cleanuppad:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    subq $32, %rsp
+; CHECK-NEXT:    movq $-2, -8(%rbp)
+; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl $7, -12(%rbp)
+; CHECK-NEXT:  .Ltmp10:
+; CHECK-NEXT:    callq g@PLT
+; CHECK-NEXT:  .Ltmp11:
+; CHECK-NEXT:  # %bb.1: # %invoke.cont
+; CHECK-NEXT:    movl $11, -12(%rbp)
+; CHECK-NEXT:  .Ltmp12:
+; CHECK-NEXT:    callq g@PLT
+; CHECK-NEXT:  .Ltmp13:
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addq $32, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB13_3: # %lpad
+; CHECK-NEXT:    .cfi_def_cfa %rbp, 16
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    addl -12(%rbp), %ecx
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    movl %ecx, (%rax)
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq # CLEANUPRET
+entry:
+  %p.wineh.spillslot = alloca i32, align 4
+  store i32 7, i32* %p.wineh.spillslot, align 4
+  invoke void @g()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  store i32 11, i32* %p.wineh.spillslot, align 4
+  invoke void @g()
+          to label %exit unwind label %lpad
+
+lpad:                                             ; preds = %invoke.cont, %entry
+  %cp = cleanuppad within none []
+  %p.wineh.reload = load i32, i32* %p.wineh.spillslot, align 4
+  %sum = add i32 %arg, %p.wineh.reload
+  store i32 %sum, i32* %ptr, align 4
+  cleanupret from %cp unwind to caller
+
+exit:                                             ; preds = %invoke.cont
+  ret i32 0
+}
+
+; Check that we don't speculate in the face of an expensive immediate. A large
+; immediate can't be folded into ALU instructions; we still need an extra MOV
+; instruction, so duplicating the PHI user doesn't bring any benefit.
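+; (Worked example, not checked output: 42949672960 is 0xA00000000, which does
+; not fit in a sign-extended 32-bit immediate, so x86 has no ADD64ri encoding
+; for it and still needs
+;   movabsq $42949672960, %rcx
+;   addq    %rcx, %rax
+; Duplicating the user would merely replicate that pair into each predecessor.)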
+define i64 @test_expensive_imm(i32 %flag, i64 %arg) { +; CHECK-LABEL: test_expensive_imm: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: leal -2(%rdi), %edx +; CHECK-NEXT: cmpl $2, %edx +; CHECK-NEXT: jb .LBB14_4 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: jne .LBB14_2 +; CHECK-NEXT: # %bb.3: # %b +; CHECK-NEXT: movabsq $29496729640, %rcx # imm = 0x6DE246028 +; CHECK-NEXT: .LBB14_4: # %exit +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB14_2: # %a +; CHECK-NEXT: movabsq $42949672960, %rcx # imm = 0xA00000000 +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq +entry: + switch i32 %flag, label %a [ + i32 1, label %b + i32 2, label %c + i32 3, label %d + ] + +a: + br label %exit + +b: + br label %exit + +c: + br label %exit + +d: + br label %exit + +exit: + %p = phi i64 [ 42949672960, %a ], [ 29496729640, %b ], [ 1, %c ], [ 1, %d ] + %sum1 = add i64 %arg, %p + ret i64 %sum1 +} + +define i32 @test_no_spec_non_postdominating_uses(i1 %flag1, i1 %flag2, i32 %arg) { +; CHECK-LABEL: test_no_spec_non_postdominating_uses: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB15_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: movl $13, %ecx +; CHECK-NEXT: leal 7(%rdx), %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: jne .LBB15_4 +; CHECK-NEXT: .LBB15_5: # %exit2 +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB15_2: # %b +; CHECK-NEXT: movl $42, %ecx +; CHECK-NEXT: leal 11(%rdx), %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: je .LBB15_5 +; CHECK-NEXT: .LBB15_4: # %exit1 +; CHECK-NEXT: retq +entry: + br i1 %flag1, label %a, label %b + +a: + br label %merge + +b: + br label %merge + +merge: + %p1 = phi i32 [ 7, %a ], [ 11, %b ] + %p2 = phi i32 [ 13, %a ], [ 42, %b ] + %sum1 = add i32 %arg, %p1 + br i1 %flag2, label %exit1, label %exit2 + +exit1: + ret i32 %sum1 + +exit2: + %sum2 = add i32 %arg, %p2 + ret i32 %sum2 +} + +declare void @g() +declare i32 @__gxx_personality_v0(...) +declare i32 @__CxxFrameHandler3(...) 
+ Index: llvm/test/CodeGen/X86/fast-isel-freeze.ll =================================================================== --- llvm/test/CodeGen/X86/fast-isel-freeze.ll +++ llvm/test/CodeGen/X86/fast-isel-freeze.ll @@ -11,8 +11,8 @@ ; ; FAST-LABEL: freeze: ; FAST: # %bb.0: -; FAST-NEXT: movl $10, %eax -; FAST-NEXT: xorl %edi, %eax +; FAST-NEXT: movl %edi, %eax +; FAST-NEXT: xorl $10, %eax ; FAST-NEXT: retq %1 = freeze i32 %t %2 = freeze i32 10 Index: llvm/test/CodeGen/X86/lrshrink.ll =================================================================== --- llvm/test/CodeGen/X86/lrshrink.ll +++ llvm/test/CodeGen/X86/lrshrink.ll @@ -16,18 +16,20 @@ ; CHECK-NEXT: .cfi_offset %rbx, -32 ; CHECK-NEXT: .cfi_offset %r14, -24 ; CHECK-NEXT: .cfi_offset %r15, -16 -; CHECK-NEXT: movq %rcx, %r14 -; CHECK-NEXT: movl $4, %r15d ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: je .LBB0_2 -; CHECK-NEXT: # %bb.1: # %then +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %then ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; CHECK-NEXT: movl $10, %r15d -; CHECK-NEXT: movq %rdx, %rsi +; CHECK-NEXT: addq $10, %rdx +; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: movq %r8, %r14 -; CHECK-NEXT: .LBB0_2: # %else +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movq %rcx, %r14 +; CHECK-NEXT: movq %rsi, %r15 +; CHECK-NEXT: addq $4, %r15 +; CHECK-NEXT: .LBB0_3: # %else ; CHECK-NEXT: addq %r9, %r14 -; CHECK-NEXT: addq %rsi, %r15 ; CHECK-NEXT: callq _Z3foov@PLT ; CHECK-NEXT: movl %eax, %ebx ; CHECK-NEXT: addq %r15, %rbx Index: llvm/test/CodeGen/X86/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/opt-pipeline.ll +++ llvm/test/CodeGen/X86/opt-pipeline.ll @@ -81,8 +81,10 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Local Dynamic TLS Access Clean-up ; CHECK-NEXT: X86 PIC Global Base Reg Initialization -; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: X86 Domain Reassignment Pass +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Duplicate PHI Users ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs Index: llvm/test/CodeGen/X86/physreg-pairs.ll =================================================================== --- llvm/test/CodeGen/X86/physreg-pairs.ll +++ llvm/test/CodeGen/X86/physreg-pairs.ll @@ -145,8 +145,8 @@ ; CHECK-LABEL: test_ebp: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: movl $19088743, %esp # imm = 0x1234567 ; CHECK-NEXT: movl $-1985229329, %ebp # imm = 0x89ABCDEF +; CHECK-NEXT: movl $19088743, %esp # imm = 0x1234567 ; CHECK-NEXT: #APP ; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: #NO_APP Index: llvm/test/CodeGen/X86/popcnt.ll =================================================================== --- llvm/test/CodeGen/X86/popcnt.ll +++ llvm/test/CodeGen/X86/popcnt.ll @@ -615,12 +615,11 @@ ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl %ecx, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: shrl $2, %eax -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl $4, %ecx ; X86-NEXT: addl %eax, %ecx @@ 
-635,12 +634,11 @@ ; X64-NEXT: shrl %eax ; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: andl %eax, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl %eax, %edi -; X64-NEXT: addl %ecx, %edi +; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-NEXT: addl %eax, %edi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrl $4, %eax ; X64-NEXT: addl %edi, %eax @@ -665,49 +663,40 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize { ; X86-NOSSE-LABEL: cnt64_optsize: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx -; X86-NOSSE-NEXT: pushl %edi -; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl %esi, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edx # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edx, %ecx -; X86-NOSSE-NEXT: subl %ecx, %esi -; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %esi, %edi -; X86-NOSSE-NEXT: andl %ecx, %edi -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: movl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %edi, %ebx -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: movl %eax, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edx, %ebx -; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %eax ; X86-NOSSE-NEXT: movl %eax, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: movl %eax, %ecx -; X86-NOSSE-NEXT: shrl $4, %ecx -; X86-NOSSE-NEXT: addl %eax, %ecx -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: addl %ecx, %eax ; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx ; 
X86-NOSSE-NEXT: retl ; ; X64-LABEL: cnt64_optsize: @@ -794,93 +783,85 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-LABEL: cnt128_optsize: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ecx, %ebx -; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ecx, %ebp -; X86-NOSSE-NEXT: shrl $2, %ebx -; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: addl %ebp, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %ebx, %ebp -; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOSSE-NEXT: movl %edi, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edi, %ebx -; X86-NOSSE-NEXT: subl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %ebx -; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %edi, %ebx +; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ebx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax -; X86-NOSSE-NEXT: subl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; 
X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %ebx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %esi, %ebx +; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %edx +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %ecx, %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: xorl %ecx, %ecx -; X86-NOSSE-NEXT: movl %ecx, 12(%eax) -; X86-NOSSE-NEXT: movl %ecx, 8(%eax) -; X86-NOSSE-NEXT: movl %ecx, 4(%eax) -; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %edx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx -; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-LABEL: cnt128_optsize: @@ -1045,12 +1026,11 @@ ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl %ecx, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: shrl $2, %eax -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl $4, %ecx ; X86-NEXT: addl %eax, %ecx @@ -1065,12 +1045,11 @@ ; X64-NEXT: shrl %eax ; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X64-NEXT: subl %eax, %edi -; 
X64-NEXT: movl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: andl %eax, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl %eax, %edi -; X64-NEXT: addl %ecx, %edi +; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-NEXT: addl %eax, %edi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrl $4, %eax ; X64-NEXT: addl %edi, %eax @@ -1095,49 +1074,40 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { ; X86-NOSSE-LABEL: cnt64_pgso: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx -; X86-NOSSE-NEXT: pushl %edi -; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl %esi, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edx # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edx, %ecx -; X86-NOSSE-NEXT: subl %ecx, %esi -; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %esi, %edi -; X86-NOSSE-NEXT: andl %ecx, %edi -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: movl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %edi, %ebx -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: movl %eax, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edx, %ebx -; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %eax ; X86-NOSSE-NEXT: movl %eax, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: movl %eax, %ecx -; X86-NOSSE-NEXT: shrl $4, %ecx -; X86-NOSSE-NEXT: addl %eax, %ecx -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: movl %eax, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: addl %ecx, %eax ; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx ; X86-NOSSE-NEXT: retl ; ; X64-LABEL: cnt64_pgso: @@ -1224,93 +1194,85 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; 
X86-NOSSE-LABEL: cnt128_pgso: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ecx, %ebx -; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ecx, %ebp -; X86-NOSSE-NEXT: shrl $2, %ebx -; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: addl %ebp, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %ebx, %ebp -; X86-NOSSE-NEXT: movl %eax, %ebx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOSSE-NEXT: movl %edi, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edi, %ebx -; X86-NOSSE-NEXT: subl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %ebx -; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl %edi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %edi, %ebx +; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %ebx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax -; X86-NOSSE-NEXT: subl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %ebx, %esi +; X86-NOSSE-NEXT: movl %esi, 
%ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %esi, %ebx +; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %edx +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %ecx, %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: xorl %ecx, %ecx -; X86-NOSSE-NEXT: movl %ecx, 12(%eax) -; X86-NOSSE-NEXT: movl %ecx, 8(%eax) -; X86-NOSSE-NEXT: movl %ecx, 4(%eax) -; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: movl %edx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %edx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx -; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-LABEL: cnt128_pgso: Index: llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll =================================================================== --- llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -46,11 +46,11 @@ ; CHECK-NEXT: ## %bb.2: ## %if.then4 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je LBB0_55 +; CHECK-NEXT: je LBB0_56 ; CHECK-NEXT: ## %bb.3: ## %SyTime.exit ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je LBB0_55 +; CHECK-NEXT: je LBB0_56 ; CHECK-NEXT: LBB0_4: ## %cleanup ; CHECK-NEXT: addq $552, %rsp ## imm = 0x228 ; CHECK-NEXT: popq %rbx @@ -63,7 +63,7 @@ ; CHECK-NEXT: LBB0_5: ## %if.end25 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je LBB0_55 +; CHECK-NEXT: je LBB0_56 ; CHECK-NEXT: ## %bb.6: ## %SyTime.exit2720 ; CHECK-NEXT: 
movq %rdx, %rbx ; CHECK-NEXT: movq %rdi, %rbp @@ -76,11 +76,12 @@ ; CHECK-NEXT: movl $32, %esi ; CHECK-NEXT: callq _memset ; CHECK-NEXT: LBB0_8: ## %while.body.preheader +; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: imulq $1040, %rbx, %rax ## imm = 0x410 ; CHECK-NEXT: movq _syBuf@GOTPCREL(%rip), %rcx ; CHECK-NEXT: leaq 8(%rcx,%rax), %rdx -; CHECK-NEXT: movl $1, %r15d ; CHECK-NEXT: movq _syCTRO@GOTPCREL(%rip), %rax +; CHECK-NEXT: movl $1, %r15d ; CHECK-NEXT: movb $1, %cl ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_9: ## %do.body @@ -90,232 +91,228 @@ ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %do.end ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: xorl %r13d, %r13d -; CHECK-NEXT: testb %r13b, %r13b -; CHECK-NEXT: jne LBB0_11 -; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader +; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: testb %r14b, %r14b +; CHECK-NEXT: jne LBB0_42 +; CHECK-NEXT: ## %bb.11: ## %while.body200.preheader ; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx ; CHECK-NEXT: leaq LJTI0_1(%rip), %rbx ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: jmp LBB0_13 +; CHECK-NEXT: xorl %r13d, %r13d +; CHECK-NEXT: jmp LBB0_14 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_20: ## %sw.bb256 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl %r13d, %r14d -; CHECK-NEXT: LBB0_21: ## %while.cond197.backedge -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: LBB0_12: ## %sw.bb256 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: movl %r14d, %r13d +; CHECK-NEXT: LBB0_13: ## %while.cond197.backedge +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: decl %r15d ; CHECK-NEXT: testl %r15d, %r15d -; CHECK-NEXT: movl %r14d, %r13d -; CHECK-NEXT: jle LBB0_22 -; CHECK-NEXT: LBB0_13: ## %while.body200 +; CHECK-NEXT: movl %r13d, %r14d +; CHECK-NEXT: jle LBB0_43 +; CHECK-NEXT: LBB0_14: ## %while.body200 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 -; CHECK-NEXT: ## Child Loop BB0_29 Depth 2 -; CHECK-NEXT: ## Child Loop BB0_38 Depth 2 -; CHECK-NEXT: leal -268(%r13), %eax +; CHECK-NEXT: ## Child Loop BB0_24 Depth 2 +; CHECK-NEXT: ## Child Loop BB0_37 Depth 2 +; CHECK-NEXT: leal -268(%r14), %eax ; CHECK-NEXT: cmpl $105, %eax -; CHECK-NEXT: ja LBB0_14 -; CHECK-NEXT: ## %bb.56: ## %while.body200 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: ja LBB0_17 +; CHECK-NEXT: ## %bb.15: ## %while.body200 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: movslq (%rbx,%rax,4), %rax ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: jmpq *%rax -; CHECK-NEXT: LBB0_44: ## %while.cond1037.preheader -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: LBB0_16: ## %while.cond1037.preheader +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: testb %r12b, %r12b -; CHECK-NEXT: movl %r13d, %r14d -; CHECK-NEXT: jne LBB0_21 -; CHECK-NEXT: jmp LBB0_55 +; CHECK-NEXT: movl %r14d, %r13d +; CHECK-NEXT: jne LBB0_13 +; CHECK-NEXT: jmp LBB0_56 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_14: ## %while.body200 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: leal 1(%r13), %eax +; CHECK-NEXT: LBB0_17: ## %while.body200 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: leal 1(%r14), %eax ; CHECK-NEXT: cmpl $21, %eax -; CHECK-NEXT: ja LBB0_20 -; CHECK-NEXT: ## %bb.15: ## 
%while.body200 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $-1, %r14d +; CHECK-NEXT: ja LBB0_12 +; CHECK-NEXT: ## %bb.18: ## %while.body200 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: movl $-1, %r13d ; CHECK-NEXT: movslq (%rdx,%rax,4), %rax ; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: jmpq *%rax -; CHECK-NEXT: LBB0_18: ## %while.cond201.preheader -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $1, %r14d -; CHECK-NEXT: jmp LBB0_21 -; CHECK-NEXT: LBB0_26: ## %sw.bb474 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: LBB0_19: ## %while.cond201.preheader +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: movl $1, %r13d +; CHECK-NEXT: jmp LBB0_13 +; CHECK-NEXT: LBB0_20: ## %sw.bb474 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: ## implicit-def: $rbp -; CHECK-NEXT: jne LBB0_34 -; CHECK-NEXT: ## %bb.27: ## %do.body479.preheader -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: jne LBB0_32 +; CHECK-NEXT: ## %bb.21: ## %do.body479.preheader +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: ## implicit-def: $rbp -; CHECK-NEXT: jne LBB0_34 -; CHECK-NEXT: ## %bb.28: ## %land.rhs485.preheader -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: jne LBB0_32 +; CHECK-NEXT: ## %bb.22: ## %land.rhs485.preheader +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: ## implicit-def: $rax -; CHECK-NEXT: jmp LBB0_29 +; CHECK-NEXT: jmp LBB0_24 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_32: ## %do.body479.backedge -; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2 +; CHECK-NEXT: LBB0_23: ## %do.body479.backedge +; CHECK-NEXT: ## in Loop: Header=BB0_24 Depth=2 ; CHECK-NEXT: leaq 1(%rbp), %rax ; CHECK-NEXT: testb %r12b, %r12b -; CHECK-NEXT: je LBB0_33 -; CHECK-NEXT: LBB0_29: ## %land.rhs485 -; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1 +; CHECK-NEXT: je LBB0_31 +; CHECK-NEXT: LBB0_24: ## %land.rhs485 +; CHECK-NEXT: ## Parent Loop BB0_14 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: js LBB0_55 -; CHECK-NEXT: ## %bb.30: ## %cond.true.i.i2780 -; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2 +; CHECK-NEXT: js LBB0_56 +; CHECK-NEXT: ## %bb.25: ## %cond.true.i.i2780 +; CHECK-NEXT: ## in Loop: Header=BB0_24 Depth=2 ; CHECK-NEXT: movq %rax, %rbp ; CHECK-NEXT: testb %r12b, %r12b -; CHECK-NEXT: jne LBB0_32 -; CHECK-NEXT: ## %bb.31: ## %lor.rhs500 -; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2 +; CHECK-NEXT: jne LBB0_23 +; CHECK-NEXT: ## %bb.26: ## %lor.rhs500 +; CHECK-NEXT: ## in Loop: Header=BB0_24 Depth=2 ; CHECK-NEXT: movl $256, %esi ## imm = 0x100 ; CHECK-NEXT: callq ___maskrune ; CHECK-NEXT: testb %r12b, %r12b -; CHECK-NEXT: jne LBB0_32 -; CHECK-NEXT: jmp LBB0_34 -; CHECK-NEXT: LBB0_45: ## %sw.bb1134 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: jne LBB0_23 +; CHECK-NEXT: jmp LBB0_32 +; CHECK-NEXT: LBB0_27: ## %sw.bb1134 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: cmpq %rax, %rcx -; CHECK-NEXT: jb LBB0_55 -; CHECK-NEXT: ## %bb.46: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: jb LBB0_56 +; CHECK-NEXT: ## %bb.28: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: movl $268, %r14d ## imm = 0x10C -; CHECK-NEXT: jmp LBB0_21 -; CHECK-NEXT: LBB0_40: ## 
%sw.bb566 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $20, %r14d -; CHECK-NEXT: jmp LBB0_21 -; CHECK-NEXT: LBB0_19: ## %sw.bb243 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $2, %r14d -; CHECK-NEXT: jmp LBB0_21 -; CHECK-NEXT: LBB0_33: ## %if.end517.loopexitsplit -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: movl $268, %r13d ## imm = 0x10C +; CHECK-NEXT: jmp LBB0_13 +; CHECK-NEXT: LBB0_29: ## %sw.bb566 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: movl $20, %r13d +; CHECK-NEXT: jmp LBB0_13 +; CHECK-NEXT: LBB0_30: ## %sw.bb243 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: movl $2, %r13d +; CHECK-NEXT: jmp LBB0_13 +; CHECK-NEXT: LBB0_31: ## %if.end517.loopexitsplit +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: incq %rbp -; CHECK-NEXT: LBB0_34: ## %if.end517 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: leal -324(%r14), %eax +; CHECK-NEXT: LBB0_32: ## %if.end517 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: leal -324(%r13), %eax ; CHECK-NEXT: cmpl $59, %eax -; CHECK-NEXT: ja LBB0_35 -; CHECK-NEXT: ## %bb.57: ## %if.end517 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: ja LBB0_34 +; CHECK-NEXT: ## %bb.33: ## %if.end517 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: movabsq $576460756598390785, %rcx ## imm = 0x800000100000001 ; CHECK-NEXT: btq %rax, %rcx -; CHECK-NEXT: jb LBB0_38 -; CHECK-NEXT: LBB0_35: ## %if.end517 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: cmpl $11, %r14d -; CHECK-NEXT: je LBB0_38 -; CHECK-NEXT: ## %bb.36: ## %if.end517 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: cmpl $24, %r14d -; CHECK-NEXT: je LBB0_38 -; CHECK-NEXT: ## %bb.37: ## %if.then532 -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: jb LBB0_37 +; CHECK-NEXT: LBB0_34: ## %if.end517 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: cmpl $11, %r13d +; CHECK-NEXT: je LBB0_37 +; CHECK-NEXT: ## %bb.35: ## %if.end517 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 +; CHECK-NEXT: cmpl $24, %r13d +; CHECK-NEXT: je LBB0_37 +; CHECK-NEXT: ## %bb.36: ## %if.then532 +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: movq _SyFgets.yank@GOTPCREL(%rip), %rax ; CHECK-NEXT: movb $0, (%rax) ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_38: ## %for.cond534 -; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1 +; CHECK-NEXT: LBB0_37: ## %for.cond534 +; CHECK-NEXT: ## Parent Loop BB0_14 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 ; CHECK-NEXT: testb %r12b, %r12b -; CHECK-NEXT: jne LBB0_38 -; CHECK-NEXT: ## %bb.39: ## %for.cond542.preheader -; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 +; CHECK-NEXT: jne LBB0_37 +; CHECK-NEXT: ## %bb.38: ## %for.cond542.preheader +; CHECK-NEXT: ## in Loop: Header=BB0_14 Depth=1 ; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: movb $0, (%rbp) -; CHECK-NEXT: movl %r13d, %r14d +; CHECK-NEXT: movl %r14d, %r13d ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx -; CHECK-NEXT: jmp LBB0_21 +; CHECK-NEXT: jmp LBB0_13 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_42: ## %while.cond864 +; CHECK-NEXT: LBB0_39: ## %while.cond864 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp LBB0_42 +; CHECK-NEXT: jmp LBB0_39 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_43: ## %while.cond962 +; CHECK-NEXT: LBB0_40: ## %while.cond962 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp LBB0_43 +; CHECK-NEXT: jmp 
LBB0_40 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_25: ## %for.cond357 +; CHECK-NEXT: LBB0_41: ## %for.cond357 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_11: +; CHECK-NEXT: jmp LBB0_41 +; CHECK-NEXT: LBB0_42: ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: LBB0_22: ## %while.end1465 -; CHECK-NEXT: incl %r14d -; CHECK-NEXT: cmpl $16, %r14d -; CHECK-NEXT: ja LBB0_50 -; CHECK-NEXT: ## %bb.23: ## %while.end1465 -; CHECK-NEXT: movl $83969, %eax ## imm = 0x14801 -; CHECK-NEXT: btl %r14d, %eax -; CHECK-NEXT: jae LBB0_50 -; CHECK-NEXT: ## %bb.24: -; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %r13d, %r13d +; CHECK-NEXT: LBB0_43: ## %while.end1465 +; CHECK-NEXT: incl %r13d +; CHECK-NEXT: cmpl $16, %r13d ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload -; CHECK-NEXT: LBB0_48: ## %if.then1477 +; CHECK-NEXT: ja LBB0_51 +; CHECK-NEXT: ## %bb.44: ## %while.end1465 +; CHECK-NEXT: movl $83969, %eax ## imm = 0x14801 +; CHECK-NEXT: btl %r13d, %eax +; CHECK-NEXT: jae LBB0_51 +; CHECK-NEXT: LBB0_45: ## %if.then1477 ; CHECK-NEXT: movl $1, %edx ; CHECK-NEXT: callq _write -; CHECK-NEXT: subq %rbp, %rbx ; CHECK-NEXT: movq _syHistory@GOTPCREL(%rip), %rax ; CHECK-NEXT: leaq 8189(%rbx,%rax), %rax ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_49: ## %for.body1723 +; CHECK-NEXT: LBB0_46: ## %for.body1723 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: decq %rax -; CHECK-NEXT: jmp LBB0_49 +; CHECK-NEXT: jmp LBB0_46 ; CHECK-NEXT: LBB0_47: ## %if.then1477.loopexit ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload -; CHECK-NEXT: movq %rbx, %rbp -; CHECK-NEXT: jmp LBB0_48 -; CHECK-NEXT: LBB0_16: ## %while.cond635.preheader +; CHECK-NEXT: subq %rbx, %rbx +; CHECK-NEXT: jmp LBB0_45 +; CHECK-NEXT: LBB0_48: ## %while.cond635.preheader ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je LBB0_41 +; CHECK-NEXT: je LBB0_50 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_17: ## %for.body643.us +; CHECK-NEXT: LBB0_49: ## %for.body643.us ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp LBB0_17 +; CHECK-NEXT: jmp LBB0_49 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_41: ## %while.cond661 +; CHECK-NEXT: LBB0_50: ## %while.cond661 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp LBB0_41 -; CHECK-NEXT: LBB0_50: ## %for.cond1480.preheader +; CHECK-NEXT: jmp LBB0_50 +; CHECK-NEXT: LBB0_51: ## %for.cond1480.preheader ; CHECK-NEXT: movl $512, %eax ## imm = 0x200 ; CHECK-NEXT: cmpq %rax, %rax -; CHECK-NEXT: jae LBB0_55 -; CHECK-NEXT: ## %bb.51: ## %for.body1664.lr.ph +; CHECK-NEXT: jae LBB0_56 +; CHECK-NEXT: ## %bb.52: ## %for.body1664.lr.ph ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp ## 4-byte Reload -; CHECK-NEXT: jne LBB0_54 -; CHECK-NEXT: ## %bb.52: ## %while.body1679.preheader +; CHECK-NEXT: jne LBB0_55 +; CHECK-NEXT: ## %bb.53: ## %while.body1679.preheader ; CHECK-NEXT: incl %ebp ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_53: ## %while.body1679 +; CHECK-NEXT: LBB0_54: ## %while.body1679 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movq (%rbx), %rdi ; CHECK-NEXT: callq _fileno ; CHECK-NEXT: movslq %ebp, %rax ; CHECK-NEXT: leal 1(%rax), %ebp ; CHECK-NEXT: cmpq %rax, %rax -; CHECK-NEXT: jl LBB0_53 -; CHECK-NEXT: 
LBB0_54: ## %while.cond1683.preheader +; CHECK-NEXT: jl LBB0_54 +; CHECK-NEXT: LBB0_55: ## %while.cond1683.preheader ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: LBB0_55: ## %if.then.i +; CHECK-NEXT: LBB0_56: ## %if.then.i ; CHECK-NEXT: ud2 entry: %sub.ptr.rhs.cast646 = ptrtoint i8* %line to i64 Index: llvm/test/CodeGen/X86/remat-phys-dead.ll =================================================================== --- llvm/test/CodeGen/X86/remat-phys-dead.ll +++ llvm/test/CodeGen/X86/remat-phys-dead.ll @@ -18,6 +18,5 @@ define i32 @test_remat32() { ret i32 0 ; CHECK: REGISTER COALESCING -; CHECK: Remat: $eax = MOV32r0 implicit-def dead $eflags } Index: llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll =================================================================== --- llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll +++ llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll @@ -283,43 +283,41 @@ ; X64-NOPIC-NEXT: pushq %rbp ; X64-NOPIC-NEXT: pushq %r15 ; X64-NOPIC-NEXT: pushq %r14 -; X64-NOPIC-NEXT: pushq %r13 ; X64-NOPIC-NEXT: pushq %r12 ; X64-NOPIC-NEXT: pushq %rbx -; X64-NOPIC-NEXT: subq $24, %rsp +; X64-NOPIC-NEXT: subq $16, %rsp ; X64-NOPIC-NEXT: movq %rsp, %rax ; X64-NOPIC-NEXT: movq %rdi, %rbx ; X64-NOPIC-NEXT: movq $-1, %r15 ; X64-NOPIC-NEXT: sarq $63, %rax -; X64-NOPIC-NEXT: leaq {{[0-9]+}}(%rsp), %r14 +; X64-NOPIC-NEXT: movq %rsp, %r14 ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: movq %r14, %rdi ; X64-NOPIC-NEXT: orq %rax, %rsp -; X64-NOPIC-NEXT: movq $.Lslh_ret_addr4, %rbp +; X64-NOPIC-NEXT: movq $.Lslh_ret_addr4, %r12 ; X64-NOPIC-NEXT: callq setjmp@PLT ; X64-NOPIC-NEXT: .Lslh_ret_addr4: ; X64-NOPIC-NEXT: movq %rsp, %rax ; X64-NOPIC-NEXT: sarq $63, %rax -; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr4, %rbp +; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr4, %r12 ; X64-NOPIC-NEXT: cmovneq %r15, %rax ; X64-NOPIC-NEXT: movl (%rbx), %ebp -; X64-NOPIC-NEXT: movl $42, %r12d ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: movq %r14, %rdi -; X64-NOPIC-NEXT: movl %r12d, %esi +; X64-NOPIC-NEXT: movl $42, %esi ; X64-NOPIC-NEXT: orq %rax, %rsp -; X64-NOPIC-NEXT: movq $.Lslh_ret_addr5, %r13 +; X64-NOPIC-NEXT: movq $.Lslh_ret_addr5, %r12 ; X64-NOPIC-NEXT: callq sigsetjmp@PLT ; X64-NOPIC-NEXT: .Lslh_ret_addr5: ; X64-NOPIC-NEXT: movq %rsp, %rax ; X64-NOPIC-NEXT: sarq $63, %rax -; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr5, %r13 +; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr5, %r12 ; X64-NOPIC-NEXT: cmovneq %r15, %rax ; X64-NOPIC-NEXT: addl (%rbx), %ebp ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: movq %r14, %rdi ; X64-NOPIC-NEXT: movq %r14, %rsi -; X64-NOPIC-NEXT: movl %r12d, %edx +; X64-NOPIC-NEXT: movl $42, %edx ; X64-NOPIC-NEXT: orq %rax, %rsp ; X64-NOPIC-NEXT: movq $.Lslh_ret_addr6, %r14 ; X64-NOPIC-NEXT: callq __sigsetjmp@PLT @@ -334,10 +332,9 @@ ; X64-NOPIC-NEXT: orl %ecx, %eax ; X64-NOPIC-NEXT: shlq $47, %rcx ; X64-NOPIC-NEXT: orq %rcx, %rsp -; X64-NOPIC-NEXT: addq $24, %rsp +; X64-NOPIC-NEXT: addq $16, %rsp ; X64-NOPIC-NEXT: popq %rbx ; X64-NOPIC-NEXT: popq %r12 -; X64-NOPIC-NEXT: popq %r13 ; X64-NOPIC-NEXT: popq %r14 ; X64-NOPIC-NEXT: popq %r15 ; X64-NOPIC-NEXT: popq %rbp @@ -348,45 +345,43 @@ ; X64-NOPIC-MCM-NEXT: pushq %rbp ; X64-NOPIC-MCM-NEXT: pushq %r15 ; X64-NOPIC-MCM-NEXT: pushq %r14 -; X64-NOPIC-MCM-NEXT: pushq %r13 ; X64-NOPIC-MCM-NEXT: pushq %r12 ; X64-NOPIC-MCM-NEXT: pushq %rbx -; X64-NOPIC-MCM-NEXT: subq $24, %rsp +; X64-NOPIC-MCM-NEXT: subq $16, %rsp ; X64-NOPIC-MCM-NEXT: movq %rsp, %rax ; X64-NOPIC-MCM-NEXT: movq 
%rdi, %rbx ; X64-NOPIC-MCM-NEXT: movq $-1, %r15 ; X64-NOPIC-MCM-NEXT: sarq $63, %rax -; X64-NOPIC-MCM-NEXT: leaq {{[0-9]+}}(%rsp), %r14 +; X64-NOPIC-MCM-NEXT: movq %rsp, %r14 ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: movq %r14, %rdi ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp -; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %rbp +; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %r12 ; X64-NOPIC-MCM-NEXT: callq setjmp@PLT ; X64-NOPIC-MCM-NEXT: .Lslh_ret_addr4: ; X64-NOPIC-MCM-NEXT: movq %rsp, %rax ; X64-NOPIC-MCM-NEXT: sarq $63, %rax ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %rcx -; X64-NOPIC-MCM-NEXT: cmpq %rcx, %rbp +; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r12 ; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rax ; X64-NOPIC-MCM-NEXT: movl (%rbx), %ebp -; X64-NOPIC-MCM-NEXT: movl $42, %r12d ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: movq %r14, %rdi -; X64-NOPIC-MCM-NEXT: movl %r12d, %esi +; X64-NOPIC-MCM-NEXT: movl $42, %esi ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp -; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %r13 +; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %r12 ; X64-NOPIC-MCM-NEXT: callq sigsetjmp@PLT ; X64-NOPIC-MCM-NEXT: .Lslh_ret_addr5: ; X64-NOPIC-MCM-NEXT: movq %rsp, %rax ; X64-NOPIC-MCM-NEXT: sarq $63, %rax ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %rcx -; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r13 +; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r12 ; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rax ; X64-NOPIC-MCM-NEXT: addl (%rbx), %ebp ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: movq %r14, %rdi ; X64-NOPIC-MCM-NEXT: movq %r14, %rsi -; X64-NOPIC-MCM-NEXT: movl %r12d, %edx +; X64-NOPIC-MCM-NEXT: movl $42, %edx ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr6(%rip), %r14 ; X64-NOPIC-MCM-NEXT: callq __sigsetjmp@PLT @@ -402,10 +397,9 @@ ; X64-NOPIC-MCM-NEXT: orl %ecx, %eax ; X64-NOPIC-MCM-NEXT: shlq $47, %rcx ; X64-NOPIC-MCM-NEXT: orq %rcx, %rsp -; X64-NOPIC-MCM-NEXT: addq $24, %rsp +; X64-NOPIC-MCM-NEXT: addq $16, %rsp ; X64-NOPIC-MCM-NEXT: popq %rbx ; X64-NOPIC-MCM-NEXT: popq %r12 -; X64-NOPIC-MCM-NEXT: popq %r13 ; X64-NOPIC-MCM-NEXT: popq %r14 ; X64-NOPIC-MCM-NEXT: popq %r15 ; X64-NOPIC-MCM-NEXT: popq %rbp @@ -416,45 +410,43 @@ ; X64-PIC-NEXT: pushq %rbp ; X64-PIC-NEXT: pushq %r15 ; X64-PIC-NEXT: pushq %r14 -; X64-PIC-NEXT: pushq %r13 ; X64-PIC-NEXT: pushq %r12 ; X64-PIC-NEXT: pushq %rbx -; X64-PIC-NEXT: subq $24, %rsp +; X64-PIC-NEXT: subq $16, %rsp ; X64-PIC-NEXT: movq %rsp, %rax ; X64-PIC-NEXT: movq %rdi, %rbx ; X64-PIC-NEXT: movq $-1, %r15 ; X64-PIC-NEXT: sarq $63, %rax -; X64-PIC-NEXT: leaq {{[0-9]+}}(%rsp), %r14 +; X64-PIC-NEXT: movq %rsp, %r14 ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: movq %r14, %rdi ; X64-PIC-NEXT: orq %rax, %rsp -; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %rbp +; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %r12 ; X64-PIC-NEXT: callq setjmp@PLT ; X64-PIC-NEXT: .Lslh_ret_addr4: ; X64-PIC-NEXT: movq %rsp, %rax ; X64-PIC-NEXT: sarq $63, %rax ; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %rcx -; X64-PIC-NEXT: cmpq %rcx, %rbp +; X64-PIC-NEXT: cmpq %rcx, %r12 ; X64-PIC-NEXT: cmovneq %r15, %rax ; X64-PIC-NEXT: movl (%rbx), %ebp -; X64-PIC-NEXT: movl $42, %r12d ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: movq %r14, %rdi -; X64-PIC-NEXT: movl %r12d, %esi +; X64-PIC-NEXT: movl $42, %esi ; X64-PIC-NEXT: orq %rax, %rsp -; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %r13 +; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %r12 ; X64-PIC-NEXT: callq sigsetjmp@PLT ; X64-PIC-NEXT: .Lslh_ret_addr5: ; X64-PIC-NEXT: movq %rsp, 
%rax ; X64-PIC-NEXT: sarq $63, %rax ; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %rcx -; X64-PIC-NEXT: cmpq %rcx, %r13 +; X64-PIC-NEXT: cmpq %rcx, %r12 ; X64-PIC-NEXT: cmovneq %r15, %rax ; X64-PIC-NEXT: addl (%rbx), %ebp ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: movq %r14, %rdi ; X64-PIC-NEXT: movq %r14, %rsi -; X64-PIC-NEXT: movl %r12d, %edx +; X64-PIC-NEXT: movl $42, %edx ; X64-PIC-NEXT: orq %rax, %rsp ; X64-PIC-NEXT: leaq .Lslh_ret_addr6(%rip), %r14 ; X64-PIC-NEXT: callq __sigsetjmp@PLT @@ -470,10 +462,9 @@ ; X64-PIC-NEXT: orl %ecx, %eax ; X64-PIC-NEXT: shlq $47, %rcx ; X64-PIC-NEXT: orq %rcx, %rsp -; X64-PIC-NEXT: addq $24, %rsp +; X64-PIC-NEXT: addq $16, %rsp ; X64-PIC-NEXT: popq %rbx ; X64-PIC-NEXT: popq %r12 -; X64-PIC-NEXT: popq %r13 ; X64-PIC-NEXT: popq %r14 ; X64-PIC-NEXT: popq %r15 ; X64-PIC-NEXT: popq %rbp