diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h
--- a/llvm/include/llvm/CodeGen/Spiller.h
+++ b/llvm/include/llvm/CodeGen/Spiller.h
@@ -12,8 +12,10 @@
 namespace llvm {
 
 class LiveRangeEdit;
+class LiveRegMatrix;
 class MachineFunction;
 class MachineFunctionPass;
+class RegisterClassInfo;
 class VirtRegMap;
 
 /// Spiller interface.
@@ -35,7 +37,8 @@
 /// Create and return a spiller that will insert spill code directly instead
 /// of deferring though VirtRegMap.
 Spiller *createInlineSpiller(MachineFunctionPass &pass, MachineFunction &mf,
-                             VirtRegMap &vrm);
+                             VirtRegMap &VRM, const RegisterClassInfo &RegClassInfo,
+                             LiveRegMatrix &Matrix);
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -584,6 +584,11 @@
     return RC;
   }
 
+  virtual const TargetRegisterClass* spillToOtherClass(const MachineRegisterInfo& MRI,
+                                                       Register Reg) const {
+    return nullptr;
+  }
+
   /// Return the subregister index you get from composing
   /// two subregister indices.
   ///
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/LiveIntervalCalc.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -81,6 +82,10 @@
     cl::init(false), cl::Hidden,
     cl::desc("Restrict remat for statepoint operands"));
 
+static cl::opt<bool> DisableSpillOtherClass("disable-spill-other-class",
+    cl::init(false), cl::Hidden,
+    cl::desc("Disable spilling to other register classes"));
+
 namespace {
 
 class HoistSpillHelper : private LiveRangeEdit::Delegate {
@@ -168,6 +173,8 @@
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  const RegisterClassInfo& RegClassInfo;
+  LiveRegMatrix& Matrix;
 
   // Variables that are valid during spill(), but used by multiple methods.
   LiveRangeEdit *Edit;
@@ -194,16 +201,19 @@
   ~InlineSpiller() override = default;
 
 public:
-  InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+  InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &VRM,
+                const RegisterClassInfo& RegClassInfo, LiveRegMatrix &Matrix)
       : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
         LSS(pass.getAnalysis<LiveStacks>()),
         AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
         MDT(pass.getAnalysis<MachineDominatorTree>()),
-        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(VRM),
         MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
         MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
-        HSpiller(pass, mf, vrm) {}
+        RegClassInfo(RegClassInfo),
+        Matrix(Matrix),
+        HSpiller(pass, mf, VRM) {}
 
   void spill(LiveRangeEdit &) override;
   void postOptimization() override;
@@ -231,6 +241,8 @@
   void spillAroundUses(Register Reg);
 
   void spillAll();
+
+  bool spillToOtherClass();
 };
 
 } // end anonymous namespace
@@ -241,8 +253,10 @@
 Spiller *llvm::createInlineSpiller(MachineFunctionPass &pass,
                                    MachineFunction &mf,
-                                   VirtRegMap &vrm) {
-  return new InlineSpiller(pass, mf, vrm);
+                                   VirtRegMap &VRM,
+                                   const RegisterClassInfo &RegClassInfo,
+                                   LiveRegMatrix &Matrix) {
+  return new InlineSpiller(pass, mf, VRM, RegClassInfo, Matrix);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1130,26 +1144,30 @@
 /// spillAll - Spill all registers remaining after rematerialization.
 void InlineSpiller::spillAll() {
-  // Update LiveStacks now that we are committed to spilling.
-  if (StackSlot == VirtRegMap::NO_STACK_SLOT) {
-    StackSlot = VRM.assignVirt2StackSlot(Original);
-    StackInt = &LSS.getOrCreateInterval(StackSlot, MRI.getRegClass(Original));
-    StackInt->getNextValue(SlotIndex(), LSS.getVNInfoAllocator());
-  } else
-    StackInt = &LSS.getInterval(StackSlot);
+  if (spillToOtherClass()) {
+    // Succeeded in copying value to a different register class.
+  } else {
+    // Update LiveStacks now that we are committed to spilling.
+    if (StackSlot == VirtRegMap::NO_STACK_SLOT) {
+      StackSlot = VRM.assignVirt2StackSlot(Original);
+      StackInt = &LSS.getOrCreateInterval(StackSlot, MRI.getRegClass(Original));
+      StackInt->getNextValue(SlotIndex(), LSS.getVNInfoAllocator());
+    } else
+      StackInt = &LSS.getInterval(StackSlot);
 
-  if (Original != Edit->getReg())
-    VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot);
+    if (Original != Edit->getReg())
+      VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot);
 
-  assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
-  for (Register Reg : RegsToSpill)
-    StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg),
-                                     StackInt->getValNumInfo(0));
-  LLVM_DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
+    assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
+    for (Register Reg : RegsToSpill)
+      StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg),
+                                       StackInt->getValNumInfo(0));
+    LLVM_DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
 
-  // Spill around uses of all RegsToSpill.
-  for (Register Reg : RegsToSpill)
-    spillAroundUses(Reg);
+    // Spill around uses of all RegsToSpill.
+    for (Register Reg : RegsToSpill)
+      spillAroundUses(Reg);
+  }
 
   // Hoisted spills may cause dead code.
   if (!DeadDefs.empty()) {
@@ -1175,6 +1193,122 @@
     Edit->eraseVirtReg(Reg);
 }
 
+bool InlineSpiller::spillToOtherClass() {
+  if (DisableSpillOtherClass)
+    return false;
+
+  const TargetRegisterClass* SpillRC = TRI.spillToOtherClass(MRI, Original);
+  if (SpillRC == nullptr)
+    return false;
+
+  // Don't deal with subranges for now.
+  for (Register Reg : RegsToSpill) {
+    LiveInterval& LI = LIS.getInterval(Reg);
+    if (LI.hasSubRanges()) {
+      return false;
+    }
+  }
+
+  // TODO: Find a way to do the interference check without creating new
+  // LiveInterval objects.
+  ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(SpillRC);
+  VNInfo::Allocator &VNIAllocator = LIS.getVNInfoAllocator();
+  MCRegister SpillPhysReg = MCRegister::NoRegister;
+  for (MCPhysReg Candidate : Order) {
+    bool RegFree = true;
+    for (Register Reg : RegsToSpill) {
+      const LiveInterval& LI = LIS.getInterval(Reg);
+      if (Matrix.checkInterferenceWithRange(LI, Candidate) != LiveRegMatrix::IK_Free) {
+        RegFree = false;
+        break;
+      }
+    }
+    if (RegFree) {
+      SpillPhysReg = Candidate;
+      break;
+    }
+  }
+  if (SpillPhysReg == MCRegister::NoRegister) {
+    return false;
+  }
+
+  Register SpillReg = MRI.createVirtualRegister(SpillRC);
+
+  LLVM_DEBUG(dbgs() << "Spill by copying to " << printReg(SpillPhysReg, &TRI) << ".\n");
+  VRM.assignVirt2Phys(SpillReg, SpillPhysReg);
+
+  // Iterate over instructions using Reg.
+  for (Register Reg : RegsToSpill) {
+    for (MachineRegisterInfo::reg_bundle_iterator
+             RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
+         RegI != E;) {
+      MachineInstr &MI = *(RegI++);
+      if (MI.isDebugValue()) {
+        // TODO
+        abort();
+      }
+      assert(!MI.isDebugInstr() && "Did not expect to find a use in debug "
+             "instruction that isn't a DBG_VALUE");
+
+      // Ignore copies to/from snippets. We'll delete them.
+      if (SnippetCopies.count(&MI))
+        continue;
+
+      // Analyze instruction.
+      SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
+      VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, Reg, &Ops);
+
+      Register NewVReg = Edit->createFrom(Reg);
+      if (RI.Reads) {
+        // Insert copy before use.
+        MachineBasicBlock &MBB = *MI.getParent();
+        const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+        MachineInstr* CopyMI = BuildMI(MBB, MI.getIterator(), DebugLoc(), Desc)
+                                   .addReg(NewVReg, RegState::Define)
+                                   .addReg(SpillReg);
+
+        LIS.InsertMachineInstrInMaps(*CopyMI);
+        LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(CopyMI->getIterator(),
+                                                      MI.getIterator(), LIS,
+                                                      "copy from other",
+                                                      NewVReg));
+      }
+
+      // Rewrite instruction operands.
+      bool hasLiveDef = false;
+      for (const auto &OpPair : Ops) {
+        MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
+        MO.setReg(NewVReg);
+        if (MO.isUse()) {
+          if (!OpPair.first->isRegTiedToDefOperand(OpPair.second))
+            MO.setIsKill();
+        } else {
+          if (!MO.isDead())
+            hasLiveDef = true;
+        }
+      }
+
+      if (hasLiveDef) {
+        // Insert COPY from def to newvreg.
+        MachineBasicBlock &MBB = *MI.getParent();
+        const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+        MachineInstr* CopyMI = BuildMI(MBB, std::next(MI.getIterator()),
+                                       DebugLoc(), Desc)
+                                   .addReg(SpillReg, RegState::Define)
+                                   .addReg(NewVReg);
+
+        LIS.InsertMachineInstrInMaps(*CopyMI);
+        LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(MI.getIterator(),
+                                                      std::next(CopyMI->getIterator()), LIS,
+                                                      "copy to other",
+                                                      NewVReg));
+      }
+    }
+  }
+
+  return true;
+}
+
 void InlineSpiller::spill(LiveRangeEdit &edit) {
   ++NumSpilledRanges;
   Edit = &edit;
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -104,8 +104,12 @@
 void LiveRegMatrix::assign(LiveInterval &VirtReg, MCRegister PhysReg) {
   LLVM_DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg(), TRI) << " to "
                     << printReg(PhysReg, TRI) << ':');
-  assert(!VRM->hasPhys(VirtReg.reg()) && "Duplicate VirtReg assignment");
-  VRM->assignVirt2Phys(VirtReg.reg(), PhysReg);
+  Register Reg = VirtReg.reg();
+  if (!VRM->hasPhys(Reg)) {
+    VRM->assignVirt2Phys(Reg, PhysReg);
+  } else {
+    assert(VRM->getPhys(Reg) == PhysReg && "Duplicate VirtReg assignment");
+  }
 
   foreachUnit(
       TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) {
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -146,19 +146,27 @@
     for (Register Reg : SplitVRegs) {
       assert(LIS->hasInterval(Reg));
-      LiveInterval *SplitVirtReg = &LIS->getInterval(Reg);
-      assert(!VRM->hasPhys(SplitVirtReg->reg()) && "Register already assigned");
-      if (MRI->reg_nodbg_empty(SplitVirtReg->reg())) {
-        assert(SplitVirtReg->empty() && "Non-empty but used interval");
-        LLVM_DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
-        aboutToRemoveInterval(*SplitVirtReg);
-        LIS->removeInterval(SplitVirtReg->reg());
+      LiveInterval &SplitVirtReg = LIS->getInterval(Reg);
+      if (MRI->reg_nodbg_empty(SplitVirtReg.reg())) {
+        assert(SplitVirtReg.empty() && "Non-empty but used interval");
+        LLVM_DEBUG(dbgs() << "not queueing unused " << SplitVirtReg << '\n');
+        aboutToRemoveInterval(SplitVirtReg);
+        LIS->removeInterval(Reg);
         continue;
       }
-      LLVM_DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
-      assert(Register::isVirtualRegister(SplitVirtReg->reg()) &&
+
+      // Strategies like copying to an alternate register have already picked a
+      // physreg ahead of time.
+      Register Assigned = VRM->getPhys(Reg);
+      if (Assigned != VirtRegMap::NO_PHYS_REG) {
+        Matrix->assign(SplitVirtReg, Assigned);
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "queuing new interval: " << SplitVirtReg << "\n");
+      assert(Register::isVirtualRegister(Reg) &&
              "expect split value in virtual register");
-      enqueue(SplitVirtReg);
+      enqueue(&SplitVirtReg);
       ++NumNewQueued;
     }
   }
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -322,7 +322,7 @@
                       getAnalysis<MachineBlockFrequencyInfo>());
   VRAI.calculateSpillWeightsAndHints();
 
-  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, RegClassInfo, *Matrix));
 
   allocatePhysRegs();
   postOptimization();
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -693,6 +693,7 @@
   if (ExtraRegInfo[Reg].Stage == RS_New)
     ExtraRegInfo[Reg].Stage = RS_Assign;
 
+  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
   if (ExtraRegInfo[Reg].Stage == RS_Split) {
     // Unsplit ranges that couldn't be allocated immediately are deferred until
     // everything else has been allocated.
@@ -708,7 +709,6 @@
     // Giant live ranges fall back to the global assignment heuristic, which
     // prevents excessive spilling in pathological cases.
     bool ReverseLocal = TRI->reverseLocalAssignment();
-    const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
     bool ForceGlobal = !ReverseLocal &&
       (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs());
 
@@ -725,20 +725,20 @@
         // large blocks on targets with many physical registers.
         Prio = Indexes->getZeroIndex().getInstrDistance(LI->endIndex());
       }
-      Prio |= RC.AllocationPriority << 24;
     } else {
       // Allocate global and split ranges in long->short order. Long ranges that
       // don't fit should be spilled (or split) ASAP so they don't create
       // interference. Mark a bit to prioritize global above local ranges.
-      Prio = (1u << 29) + Size;
+      Prio = (1u << 21) + Size;
     }
     // Mark a higher bit to prioritize global and local above RS_Split.
-    Prio |= (1u << 31);
+    Prio |= (1u << 23);
 
     // Boost ranges that have a physical register hint.
     if (VRM->hasKnownPreference(Reg))
-      Prio |= (1u << 30);
+      Prio |= (1u << 22);
   }
+  Prio |= RC.AllocationPriority << 24;
 
   // The virtual register number is a tie breaker for same-sized ranges.
   // Give lower vreg numbers higher priority to assign them first.
   CurQueue.push(std::make_pair(Prio, ~Reg));
@@ -3233,7 +3233,7 @@
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   DomTree = &getAnalysis<MachineDominatorTree>();
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
-  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, RegClassInfo, *Matrix));
   Loops = &getAnalysis<MachineLoopInfo>();
   Bundles = &getAnalysis<EdgeBundles>();
   SpillPlacer = &getAnalysis<SpillPlacement>();
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -803,7 +803,8 @@
   PBQPVirtRegAuxInfo VRAI(MF, LIS, VRM, getAnalysis<MachineLoopInfo>(), MBFI);
   VRAI.calculateSpillWeightsAndHints();
 
-  std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM));
+  // TODO! FIXME!
+  std::unique_ptr<Spiller> VRegSpiller(nullptr);
 
   MF.getRegInfo().freezeReservedRegs(MF);
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -243,6 +243,7 @@
   VRM = &getAnalysis<VirtRegMap>();
   LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
                     << "********** Function: " << MF->getName() << '\n');
+  LLVM_DEBUG(LIS->dump());
   LLVM_DEBUG(VRM->dump());
 
   // Add kill flags while we still have virtual registers.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -149,6 +149,8 @@
                              SmallVectorImpl<MCPhysReg> &Hints,
                              const MachineFunction &MF, const VirtRegMap *VRM,
                              const LiveRegMatrix *Matrix) const override;
+
+  virtual const TargetRegisterClass* spillToOtherClass(const MachineRegisterInfo& MRI, Register Reg) const override;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -933,3 +933,73 @@
   return true;
 }
+
+const TargetRegisterClass* X86RegisterInfo::spillToOtherClass(const MachineRegisterInfo& MRI, Register Reg) const {
+  unsigned RCId = MRI.getRegClass(Reg)->getID();
+  // TODO: We should somehow compute a list of relevant classes
+  // (all classes that only have RxX registers as members)
+  switch (RCId) {
+  case X86::GR32RegClassID:
+  case X86::GR32_NOSPRegClassID:
+  case X86::GR32_NOREXRegClassID:
+  case X86::GR32_NOREX_NOSPRegClassID:
+  case X86::GR32_ABCDRegClassID:
+  case X86::GR32_TCRegClassID:
+  case X86::GR32_ABCD_and_GR32_TCRegClassID:
+  case X86::GR32_ADRegClassID:
+  case X86::GR32_BPSPRegClassID:
+  case X86::GR32_BSIRegClassID:
+  case X86::GR32_CBRegClassID:
+  case X86::GR32_DCRegClassID:
+  case X86::GR32_DIBPRegClassID:
+  case X86::GR32_SIDIRegClassID:
+  case X86::GR32_ABCD_and_GR32_BSIRegClassID:
+  case X86::GR32_AD_and_GR32_DCRegClassID:
+  case X86::GR32_BPSP_and_GR32_DIBPRegClassID:
+  case X86::GR32_BPSP_and_GR32_TCRegClassID:
+  case X86::GR32_BSI_and_GR32_SIDIRegClassID:
+  case X86::GR32_CB_and_GR32_DCRegClassID:
+  case X86::GR32_DIBP_and_GR32_SIDIRegClassID:
+  case X86::GR64RegClassID:
+  case X86::GR64_with_sub_8bitRegClassID:
+  case X86::GR64_NOSPRegClassID:
+  case X86::GR64_TCRegClassID:
+  case X86::GR64_NOREXRegClassID:
+  case X86::GR64_TCW64RegClassID:
+  case X86::GR64_TC_with_sub_8bitRegClassID:
+  case X86::GR64_NOSP_and_GR64_TCRegClassID:
+  case X86::GR64_TCW64_with_sub_8bitRegClassID:
+  case X86::GR64_TC_and_GR64_TCW64RegClassID:
+  case X86::GR64_with_sub_16bit_in_GR16_NOREXRegClassID:
+  case X86::GR64_NOREX_NOSPRegClassID:
+  case X86::GR64_NOREX_and_GR64_TCRegClassID:
+  case X86::GR64_NOSP_and_GR64_TCW64RegClassID:
+  case X86::GR64_TCW64_and_GR64_TC_with_sub_8bitRegClassID:
+  case X86::GR64_TC_and_GR64_NOSP_and_GR64_TCW64RegClassID:
+  case X86::GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREXRegClassID:
+  case X86::GR64_NOREX_NOSP_and_GR64_TCRegClassID:
+  case X86::GR64_NOREX_and_GR64_TCW64RegClassID:
+  case X86::GR64_ABCDRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_TCRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TCRegClassID:
+  case X86::GR64_ADRegClassID:
+  case X86::GR64_and_LOW32_ADDR_ACCESS_RBPRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_BPSPRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_BSIRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_CBRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_DCRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_DIBPRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_SIDIRegClassID:
+  case X86::GR64_and_LOW32_ADDR_ACCESSRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSIRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_AD_and_GR32_DCRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBPRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TCRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDIRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_CB_and_GR32_DCRegClassID:
+  case X86::GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDIRegClassID:
+    return &X86::FR64RegClass;
+  default:
+    return nullptr;
+  }
+}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -533,9 +533,13 @@
 def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>;
 
 // Scalar SSE2 floating point registers.
-def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)> {
+  let AllocationPriority = 10;
+}
 
-def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)> {
+  let AllocationPriority = 10;
+}
 
 // FIXME: This sets up the floating point register files as though they are f64
@@ -566,11 +570,17 @@
 // Generic vector registers: VR64 and VR128.
 // Ensure that float types are declared first - only float is legal on SSE1.
-def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)> {
+  let AllocationPriority = 10;
+}
 def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
-                          128, (add FR32)>;
+                          128, (add FR32)> {
+  let AllocationPriority = 10;
+}
 def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
-                          256, (sequence "YMM%u", 0, 15)>;
+                          256, (sequence "YMM%u", 0, 15)> {
+  let AllocationPriority = 10;
+}
 
 // Status flags registers.
 def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
@@ -588,22 +598,34 @@
 // AVX-512 vector/mask registers.
 def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
-                          512, (sequence "ZMM%u", 0, 31)>;
+                          512, (sequence "ZMM%u", 0, 31)> {
+  let AllocationPriority = 10;
+}
 
 // Represents the lower 16 registers that have VEX/legacy encodable subregs.
 def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
-                               512, (sequence "ZMM%u", 0, 15)>;
+                               512, (sequence "ZMM%u", 0, 15)> {
+  let AllocationPriority = 10;
+}
 
 // Scalar AVX-512 floating point registers.
-def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; +def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)> { + let AllocationPriority = 10; +} -def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)> { + let AllocationPriority = 10; +} // Extended VR128 and VR256 for AVX-512 instructions def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], - 128, (add FR32X)>; + 128, (add FR32X)> { + let AllocationPriority = 10; +} def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], - 256, (sequence "YMM%u", 0, 31)>; + 256, (sequence "YMM%u", 0, 31)> { + let AllocationPriority = 10; +} // Mask registers def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} diff --git a/llvm/test/CodeGen/X86/anyregcc.ll b/llvm/test/CodeGen/X86/anyregcc.ll --- a/llvm/test/CodeGen/X86/anyregcc.ll +++ b/llvm/test/CodeGen/X86/anyregcc.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -frame-pointer=all | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -frame-pointer=all | FileCheck --check-prefix=SSE %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -frame-pointer=all | FileCheck --check-prefix=AVX %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -frame-pointer=all -disable-spill-other-class | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -frame-pointer=all -disable-spill-other-class | FileCheck --check-prefix=SSE %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -frame-pointer=all -disable-spill-other-class | FileCheck --check-prefix=AVX %s ; Stackmap Header: no constants - 6 callsites diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -1934,9 +1934,9 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -1952,12 +1952,12 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 48(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xd4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: 
## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],mem[1] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1966,12 +1966,12 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 40(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc8,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1980,7 +1980,7 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 32(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xbc,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] @@ -1988,10 +1988,10 @@ ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],xmm1[1] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] @@ -2001,12 +2001,12 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 24(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x18] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: 
fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] @@ -2014,7 +2014,7 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 16(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] @@ -2028,7 +2028,7 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 8(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x08] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] @@ -2048,44 +2048,44 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x38] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: 
## encoding: [0xdb,0xac,0x24,0xbc,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xc8,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xd4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] -; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] -; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x38] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x48] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x40] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: 
[0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -4802,14 +4802,13 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: subq $240, %rsp -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: subq $112, %rsp +; X64-NEXT: movq %rdx, %xmm0 ; X64-NEXT: movq 40(%rdi), %r15 ; X64-NEXT: movq 32(%rdi), %r9 ; X64-NEXT: movq 56(%rdi), %r8 ; X64-NEXT: movq 48(%rdi), %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdi, %xmm1 ; X64-NEXT: movq (%rsi), %rdi ; X64-NEXT: movq 8(%rsi), %r11 ; X64-NEXT: movq %rsi, %r13 @@ -4818,13 +4817,13 @@ ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %xmm4 ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 @@ -4833,34 +4832,35 @@ ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi ; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %xmm11 ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq %rsi, %r8 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm5 ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %xmm2 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r11, %rcx ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm6 ; X64-NEXT: adcq %rbp, %rbx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rbp +; X64-NEXT: movq %r15, %r11 ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %xmm3 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rbx, %rcx @@ -4870,23 +4870,22 @@ ; X64-NEXT: adcq %r10, %r15 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r13, %rdi -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r13, %r12 ; X64-NEXT: movq 16(%r13), %r10 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: 
movq %rdx, %rdi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, %r11 -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r12, %rsi +; X64-NEXT: addq %rdi, %rsi ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq 24(%rdi), %rdi +; X64-NEXT: movq 24(%r12), %rdi +; X64-NEXT: movq %r12, %xmm7 ; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %xmm9 ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdi, %r12 ; X64-NEXT: movq %rdx, %rbp @@ -4895,6 +4894,7 @@ ; X64-NEXT: adcq %rbx, %rbp ; X64-NEXT: setb %r9b ; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %xmm10 ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbx @@ -4902,7 +4902,7 @@ ; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r13, %xmm8 ; X64-NEXT: adcq %r15, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rbx @@ -4910,12 +4910,12 @@ ; X64-NEXT: addq %r14, %rbx ; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %r11b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %xmm11, %r9 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rsi @@ -4923,7 +4923,7 @@ ; X64-NEXT: addq %r14, %rbp ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movdqa %xmm4, %xmm11 ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax @@ -4931,54 +4931,58 @@ ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil ; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %xmm12 ; X64-NEXT: mulq %r12 +; X64-NEXT: movq %r12, %xmm4 ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %rbx, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r15, %xmm14 ; X64-NEXT: adcq %rdi, %rbp ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl %r11b, %ecx ; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm15 ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 16(%rsi), %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq 24(%rsi), %rbx +; X64-NEXT: movq %rdx, %xmm13 +; X64-NEXT: movq %xmm1, %rsi +; X64-NEXT: movq 16(%rsi), %rbx ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %xmm2, %r8 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 24(%rsi), %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r9, %rbp +; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: 
movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %xmm3, %r13 ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbp, %r15 ; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: setb %bl +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm1 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq 8(%rsi), %r9 ; X64-NEXT: movq %r9, %rax @@ -4988,10 +4992,11 @@ ; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm2 ; X64-NEXT: adcq %rbp, %rsi ; X64-NEXT: setb %bl ; X64-NEXT: movq %r9, %rax @@ -5019,7 +5024,7 @@ ; X64-NEXT: addq %rsi, %rdi ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %xmm4, %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: addq %rdi, %rax @@ -5034,17 +5039,17 @@ ; X64-NEXT: movzbl %r15b, %eax ; X64-NEXT: adcq %rax, %r8 ; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r14, %xmm3 ; X64-NEXT: adcq %r13, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdi, %xmm4 ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %r8 ; X64-NEXT: addq %r12, %rbx ; X64-NEXT: adcq %r11, %r8 ; X64-NEXT: setb %r14b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %r10, (%rsp) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 @@ -5055,9 +5060,8 @@ ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 @@ -5066,33 +5070,44 @@ ; X64-NEXT: setb %dil ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %rdx ; X64-NEXT: addq %rbx, %r11 ; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: movzbl %r14b, %ecx -; X64-NEXT: adcq %rcx, %rax +; X64-NEXT: movzbl %r14b, %eax +; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: movq %xmm5, %rax +; X64-NEXT: addq %rax, %r11 ; X64-NEXT: movq %r11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; X64-NEXT: movq %xmm6, %rax +; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %xmm8, %rax +; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %xmm8 +; X64-NEXT: movq %xmm14, %rax +; X64-NEXT: adcq $0, %rax +; X64-NEXT: movq %rax, %xmm14 ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %xmm15, %rax +; X64-NEXT: adcq $0, %rax +; X64-NEXT: movq %rax, %xmm15 +; X64-NEXT: movq %xmm13, %rax +; X64-NEXT: adcq $0, %rax +; X64-NEXT: movq %rax, %xmm13 +; X64-NEXT: movq %xmm7, %rsi ; X64-NEXT: movq 32(%rsi), %rdi +; X64-NEXT: movq %r15, %rbx ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r9, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %r9, %r10 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdi, %r11 @@ -5102,29 +5117,29 @@ ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq 40(%rsi), %rcx ; X64-NEXT: movq %rsi, %r13 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rbp, %r9 ; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: setb %bl +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rax, %xmm5 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi @@ -5135,27 +5150,29 @@ ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm6 ; X64-NEXT: adcq %rdi, %rbp ; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rbp, %rcx ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r8 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: addq %r15, %rcx ; X64-NEXT: adcq %r9, %r8 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill ; X64-NEXT: movq 48(%r13), %r11 ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx @@ -5178,7 +5195,7 @@ ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r9, %xmm7 ; X64-NEXT: adcq %r8, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %rsi @@ -5218,32 +5235,39 @@ ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %xmm5, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: movq %rax, %xmm5 +; X64-NEXT: movq %xmm6, %rax +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: movq %rax, %xmm6 +; X64-NEXT: movq %xmm7, %rax +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: movq %rax, %xmm7 +; X64-NEXT: movq %xmm8, %rax +; X64-NEXT: adcq %rax, %r13 +; X64-NEXT: movq %r13, %xmm8 ; X64-NEXT: adcq $0, %r8 ; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: movq %xmm14, %rax +; X64-NEXT: addq %rax, %r8 ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; X64-NEXT: movq %r9, %xmm14 +; X64-NEXT: movq %xmm15, %rax +; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: movq %xmm13, %rax +; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm11, %rdi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %rax, %xmm13 +; X64-NEXT: movq %xmm12, %rbp ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rsi, %r8 @@ -5267,12 +5291,12 @@ ; X64-NEXT: addq %rcx, %r13 ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %xmm9, %rbp ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rax, %xmm9 +; X64-NEXT: movq %xmm10, %rsi 
; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbx @@ -5284,7 +5308,7 @@ ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm10 ; X64-NEXT: adcq %rbx, %rbp ; X64-NEXT: setb %bl ; X64-NEXT: movq %rsi, %rax @@ -5294,19 +5318,23 @@ ; X64-NEXT: addq %rbp, %rcx ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: movq %xmm13, %rax +; X64-NEXT: addq %rax, %rcx ; X64-NEXT: adcq %r12, %r15 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %r11 ; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: movq %rsi, %r9 -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rbp, %xmm13 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r8, %rsi @@ -5321,6 +5349,7 @@ ; X64-NEXT: setb %sil ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %xmm15 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rbx, %rbp @@ -5333,13 +5362,15 @@ ; X64-NEXT: addq %r13, %rbp ; X64-NEXT: adcq %r11, %rdi ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %xmm11, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %xmm13, %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %xmm12, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi @@ -5347,7 +5378,7 @@ ; X64-NEXT: addq %r15, %rbx ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %xmm15, %rcx ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: addq %rbx, %rax @@ -5365,26 +5396,29 @@ ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: movq %xmm9, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: movq %rax, %xmm9 +; X64-NEXT: movq %xmm10, %rax +; X64-NEXT: movq %xmm14, %rsi +; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: movq %rax, %xmm10 ; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r12, %xmm12 ; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r8, %xmm11 ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill +; X64-NEXT: movq %r13, %xmm13 ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r15, %xmm14 ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %xmm15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq 64(%rsi), %rdi -; X64-NEXT: movq (%rsp), %rbx # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx @@ -5483,7 +5517,7 @@ ; X64-NEXT: addq %r15, %rsi ; X64-NEXT: adcq %r10, %rbx ; X64-NEXT: setb %r9b -; X64-NEXT: movq (%rsp), %r14 # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx @@ -5510,7 +5544,7 @@ ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %rdx ; X64-NEXT: addq %rsi, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r10, (%rsp) # 8-byte Spill ; X64-NEXT: adcq %rbx, %rbp ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl %r9b, %eax @@ -5622,7 +5656,7 @@ ; X64-NEXT: adcq %r14, %rsi ; X64-NEXT: adcq %r11, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: addq (%rsp), %r15 # 8-byte Folded Reload ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5666,7 +5700,7 @@ ; X64-NEXT: movq %r13, %rax ; X64-NEXT: movq %r14, %rdi ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq 72(%r9), %rax ; X64-NEXT: movq %rax, %r9 @@ -5747,18 +5781,18 @@ ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rax, %rbp ; X64-NEXT: adcq %rsi, %r14 -; X64-NEXT: setb %sil +; X64-NEXT: setb %cl ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: addq %r14, %rax -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %r13, %r12 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %rbp +; X64-NEXT: movq %rbp, (%rsp) # 8-byte Spill ; X64-NEXT: movzbl %r8b, %ecx ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5802,8 +5836,8 @@ ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq %rsi, %rdi ; X64-NEXT: setb %cl ; X64-NEXT: movq %rbx, %rax @@ -5829,8 +5863,8 @@ ; X64-NEXT: movq %rsi, %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: imulq %rbp, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: imulq %rbx, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; 
X64-NEXT: mulq %rdi ; X64-NEXT: movq %rax, %r13 @@ -5845,7 +5879,7 @@ ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rdi @@ -5854,11 +5888,11 @@ ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %r10 ; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: setb %dil -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %dil, %ecx @@ -5866,61 +5900,67 @@ ; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq %rbp, %r10 ; X64-NEXT: adcq %r15, %rax ; X64-NEXT: adcq %r12, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq (%rsp), %r10 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: movq %xmm9, %rcx +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: movq %xmm10, %rcx +; X64-NEXT: adcq %rcx, %rbx +; X64-NEXT: movq %xmm12, %rcx +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: movq %xmm11, %rcx +; X64-NEXT: adcq %rcx, %rbp +; X64-NEXT: movq %xmm13, %rcx +; X64-NEXT: adcq %rcx, %r8 +; X64-NEXT: movq %xmm14, %rcx +; X64-NEXT: adcq %rcx, %r10 ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; 
X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm15, %rcx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: movq %xmm0, %rcx +; X64-NEXT: movq %xmm1, %rdi ; X64-NEXT: movq %rdi, (%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm2, %rdi ; X64-NEXT: movq %rdi, 8(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm3, %rdi ; X64-NEXT: movq %rdi, 16(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm4, %rdi ; X64-NEXT: movq %rdi, 24(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm5, %rdi ; X64-NEXT: movq %rdi, 32(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm6, %rdi ; X64-NEXT: movq %rdi, 40(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm7, %rdi ; X64-NEXT: movq %rdi, 48(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %xmm8, %rdi ; X64-NEXT: movq %rdi, 56(%rcx) ; X64-NEXT: movq %r9, 64(%rcx) -; X64-NEXT: movq %r10, 72(%rcx) -; X64-NEXT: movq %rbx, 80(%rcx) +; X64-NEXT: movq %rbx, 72(%rcx) +; X64-NEXT: movq %rsi, 80(%rcx) ; X64-NEXT: movq %rbp, 88(%rcx) ; X64-NEXT: movq %r8, 96(%rcx) -; X64-NEXT: movq %rsi, 104(%rcx) +; X64-NEXT: movq %r10, 104(%rcx) ; X64-NEXT: movq %rax, 112(%rcx) ; X64-NEXT: movq %rdx, 120(%rcx) -; X64-NEXT: addq $240, %rsp +; X64-NEXT: addq $112, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -1183,29 +1183,27 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: pushq %rax -; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %xmm0 ; X64-NEXT: movq (%rdi), %r9 ; X64-NEXT: movq 8(%rdi), %r15 ; X64-NEXT: movq 24(%rdi), %r12 ; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdi, %xmm5 ; X64-NEXT: movq (%rsi), %rdi ; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq %rsi, %xmm3 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, %xmm6 ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 @@ -1214,35 +1212,35 @@ ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi ; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %xmm7 ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rcx, %r13 ; X64-NEXT: adcq %rsi, %r8 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm1 ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %xmm13 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: 
addq %rcx, %rbp ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %r9, %rax ; X64-NEXT: movq %r9, %r12 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm2 ; X64-NEXT: adcq %rbx, %rcx ; X64-NEXT: setb %sil ; X64-NEXT: movq %r15, %rdi ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r14, %xmm14 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx @@ -1253,7 +1251,7 @@ ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq %r8, %r14 ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %xmm3, %rsi ; X64-NEXT: movq 16(%rsi), %r8 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r8 @@ -1261,14 +1259,15 @@ ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, %r11 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r10, %rbp ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq 24(%rsi), %rdi +; X64-NEXT: movdqa %xmm3, %xmm11 ; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %xmm10 ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: addq %rbp, %rax @@ -1276,6 +1275,7 @@ ; X64-NEXT: adcq %rcx, %rsi ; X64-NEXT: setb %cl ; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %xmm12 ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r12 @@ -1283,20 +1283,21 @@ ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r11 ; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r9, %xmm3 ; X64-NEXT: adcq %r15, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %xmm4 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %r11 ; X64-NEXT: addq %r13, %r12 ; X64-NEXT: adcq %r14, %r11 ; X64-NEXT: setb %r9b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %xmm6, %rbx ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %xmm7, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi @@ -1304,6 +1305,7 @@ ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movdqa %xmm6, %xmm15 ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax @@ -1316,15 +1318,15 @@ ; X64-NEXT: movzbl %bl, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %r12, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r14, %xmm8 ; X64-NEXT: adcq %r11, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %xmm6 ; X64-NEXT: movzbl %r9b, %ecx ; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm9 ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rdx, %xmm7 +; X64-NEXT: movq %xmm5, %rcx ; X64-NEXT: movq 32(%rcx), %r10 ; X64-NEXT: imulq %r10, %rdi ; X64-NEXT: movq %r10, %rax @@ -1336,9 +1338,9 @@ ; X64-NEXT: addq %rdx, %r8 ; X64-NEXT: movq 48(%rcx), %rax ; X64-NEXT: movq %rax, 
%rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %xmm14, %rbx ; X64-NEXT: imulq %rbx, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %xmm13, %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rdi, %rdx @@ -1351,7 +1353,7 @@ ; X64-NEXT: movq %rsi, %rcx ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %xmm5 ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: mulq %r10 @@ -1375,9 +1377,9 @@ ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r12, %r13 ; X64-NEXT: adcq %rbp, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %xmm11, %rdx ; X64-NEXT: movq 56(%rdx), %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %xmm10, %rax ; X64-NEXT: imulq %rax, %rcx ; X64-NEXT: movq 48(%rdx), %rbp ; X64-NEXT: movq %rdx, %rsi @@ -1385,12 +1387,12 @@ ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %xmm12, %r8 ; X64-NEXT: imulq %r8, %rbp ; X64-NEXT: addq %rdx, %rbp ; X64-NEXT: movq 32(%rsi), %rdi ; X64-NEXT: movq 40(%rsi), %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %xmm15, %rax ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: imulq %rbx, %rsi ; X64-NEXT: mulq %rdi @@ -1425,28 +1427,32 @@ ; X64-NEXT: adcq %rbp, %rdx ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: adcq %r14, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: movq %xmm5, %rcx +; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: adcq %r10, %rdi ; X64-NEXT: adcq %r13, %rax ; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %xmm8, %rcx +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movq %xmm6, %rcx +; X64-NEXT: adcq %rcx, %rdi +; X64-NEXT: movq %xmm9, %rcx +; X64-NEXT: adcq %rcx, %rax +; X64-NEXT: movq %xmm7, %rcx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: movq %xmm0, %rcx +; X64-NEXT: movq %xmm1, %rbp ; X64-NEXT: movq %rbp, (%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %xmm2, %rbp ; X64-NEXT: movq %rbp, 8(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %xmm3, %rbp ; X64-NEXT: movq %rbp, 16(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %xmm4, %rbp ; X64-NEXT: movq %rbp, 24(%rcx) ; X64-NEXT: movq %rsi, 32(%rcx) ; X64-NEXT: movq %rdi, 40(%rcx) ; X64-NEXT: movq %rax, 48(%rcx) ; X64-NEXT: movq %rdx, 56(%rcx) -; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll b/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll --- a/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll +++ b/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll @@ -58,13 +58,13 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", 
"=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2) - %3 = extractvalue { i8, i32 } %2, 1 - %4 = bitcast i8* %a3 to i32* - store i32 %3, i32* %4, align 1 - %5 = extractvalue { i8, i32 } %2, 0 - ret i8 %5 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2) + %2 = extractvalue { i8, i32 } %1, 1 + %3 = bitcast i8* %a3 to i32* + store i32 %2, i32* %3, align 1 + %4 = extractvalue { i8, i32 } %1, 0 + ret i8 %4 } define i8 @stack_fold_addcarry_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) { @@ -115,13 +115,13 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2) - %3 = extractvalue { i8, i64 } %2, 1 - %4 = bitcast i8* %a3 to i64* - store i64 %3, i64* %4, align 1 - %5 = extractvalue { i8, i64 } %2, 0 - ret i8 %5 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2) + %2 = extractvalue { i8, i64 } %1, 1 + %3 = bitcast i8* %a3 to i64* + store i64 %2, i64* %3, align 1 + %4 = extractvalue { i8, i64 } %1, 0 + ret i8 %4 } define i8 @stack_fold_addcarryx_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) { @@ -172,13 +172,13 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2) - %3 = extractvalue { i8, i32 } %2, 1 - %4 = bitcast i8* %a3 to i32* - store i32 %3, i32* %4, align 1 - %5 = extractvalue { i8, i32 } %2, 0 - ret i8 %5 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2) + %2 = extractvalue { i8, i32 } %1, 1 + %3 = bitcast i8* %a3 to i32* + store i32 %2, i32* %3, align 1 + %4 = extractvalue { i8, i32 } %1, 0 + ret i8 %4 } define i8 @stack_fold_addcarryx_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) { @@ -229,13 +229,13 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2) - %3 = extractvalue { i8, i64 } %2, 1 - %4 = bitcast i8* %a3 to i64* - store i64 %3, i64* %4, align 1 - %5 = extractvalue { i8, i64 } %2, 0 - ret i8 %5 + call void asm sideeffect "nop", 
"~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2) + %2 = extractvalue { i8, i64 } %1, 1 + %3 = bitcast i8* %a3 to i64* + store i64 %2, i64* %3, align 1 + %4 = extractvalue { i8, i64 } %1, 0 + ret i8 %4 } define i8 @stack_fold_subborrow_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) { @@ -286,13 +286,13 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call { i8, i32 } @llvm.x86.subborrow.32(i8 %a0, i32 %a1, i32 %a2) - %3 = extractvalue { i8, i32 } %2, 1 - %4 = bitcast i8* %a3 to i32* - store i32 %3, i32* %4, align 1 - %5 = extractvalue { i8, i32 } %2, 0 - ret i8 %5 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call { i8, i32 } @llvm.x86.subborrow.32(i8 %a0, i32 %a1, i32 %a2) + %2 = extractvalue { i8, i32 } %1, 1 + %3 = bitcast i8* %a3 to i32* + store i32 %2, i32* %3, align 1 + %4 = extractvalue { i8, i32 } %1, 0 + ret i8 %4 } define i8 @stack_fold_subborrow_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) { @@ -343,13 +343,13 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call { i8, i64 } @llvm.x86.subborrow.64(i8 %a0, i64 %a1, i64 %a2) - %3 = extractvalue { i8, i64 } %2, 1 - %4 = bitcast i8* %a3 to i64* - store i64 %3, i64* %4, align 1 - %5 = extractvalue { i8, i64 } %2, 0 - ret i8 %5 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call { i8, i64 } @llvm.x86.subborrow.64(i8 %a0, i64 %a1, i64 %a2) + %2 = extractvalue { i8, i64 } %1, 1 + %3 = bitcast i8* %a3 to i64* + store i64 %2, i64* %3, align 1 + %4 = extractvalue { i8, i64 } %1, 0 + ret i8 %4 } declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32) diff --git a/llvm/test/CodeGen/X86/stack-folding-bmi.ll b/llvm/test/CodeGen/X86/stack-folding-bmi.ll --- a/llvm/test/CodeGen/X86/stack-folding-bmi.ll +++ b/llvm/test/CodeGen/X86/stack-folding-bmi.ll @@ -50,10 +50,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = xor i32 %a0, -1 - %3 = and i32 %a1, %2 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = xor i32 %a0, -1 + %2 = and i32 %a1, %1 + ret i32 %2 } define i64 @stack_fold_andn_u64(i64 %a0, i64 %a1) { @@ -97,10 +97,10 @@ ; 
CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = xor i64 %a0, -1 - %3 = and i64 %a1, %2 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = xor i64 %a0, -1 + %2 = and i64 %a1, %1 + ret i64 %2 } define i32 @stack_fold_bextr_u32(i32 %a0, i32 %a1) { @@ -144,9 +144,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1) + ret i32 %1 } declare i32 @llvm.x86.bmi.bextr.32(i32, i32) @@ -191,9 +191,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1) + ret i64 %1 } declare i64 @llvm.x86.bmi.bextr.64(i64, i64) @@ -236,10 +236,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i32 0, %a0 - %3 = and i32 %2, %a0 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i32 0, %a0 + %2 = and i32 %1, %a0 + ret i32 %2 } define i64 @stack_fold_blsi_u64(i64 %a0) { @@ -281,10 +281,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i64 0, %a0 - %3 = and i64 %2, %a0 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i64 0, %a0 + %2 = and i64 %1, %a0 + ret i64 %2 } define i32 @stack_fold_blsmsk_u32(i32 %a0) { @@ -326,10 +326,10 @@ ; CHECK-NEXT: popq %rbp ; 
CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i32 %a0, 1 - %3 = xor i32 %2, %a0 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i32 %a0, 1 + %2 = xor i32 %1, %a0 + ret i32 %2 } define i64 @stack_fold_blsmsk_u64(i64 %a0) { @@ -371,10 +371,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i64 %a0, 1 - %3 = xor i64 %2, %a0 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i64 %a0, 1 + %2 = xor i64 %1, %a0 + ret i64 %2 } define i32 @stack_fold_blsr_u32(i32 %a0) { @@ -416,10 +416,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i32 %a0, 1 - %3 = and i32 %2, %a0 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i32 %a0, 1 + %2 = and i32 %1, %a0 + ret i32 %2 } define i64 @stack_fold_blsr_u64(i64 %a0) { @@ -461,10 +461,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i64 %a0, 1 - %3 = and i64 %2, %a0 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i64 %a0, 1 + %2 = and i64 %1, %a0 + ret i64 %2 } ;TODO stack_fold_tzcnt_u16 @@ -508,9 +508,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i32 @llvm.cttz.i32(i32 %a0, i1 0) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i32 @llvm.cttz.i32(i32 %a0, i1 0) + ret i32 %1 } declare i32 @llvm.cttz.i32(i32, i1) @@ -553,8 +553,8 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", 
"=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i64 @llvm.cttz.i64(i64 %a0, i1 0) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i64 @llvm.cttz.i64(i64 %a0, i1 0) + ret i64 %1 } declare i64 @llvm.cttz.i64(i64, i1) diff --git a/llvm/test/CodeGen/X86/stack-folding-bmi2.ll b/llvm/test/CodeGen/X86/stack-folding-bmi2.ll --- a/llvm/test/CodeGen/X86/stack-folding-bmi2.ll +++ b/llvm/test/CodeGen/X86/stack-folding-bmi2.ll @@ -50,9 +50,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1) + ret i32 %1 } declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) @@ -97,9 +97,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1) + ret i64 %1 } declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) @@ -144,9 +144,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) + ret i32 %1 } declare i32 @llvm.x86.bmi.pdep.32(i32, i32) @@ -191,9 +191,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i64 
@llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) + ret i64 %1 } declare i64 @llvm.x86.bmi.pdep.64(i64, i64) @@ -238,9 +238,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) + ret i32 %1 } declare i32 @llvm.x86.bmi.pext.32(i32, i32) @@ -285,8 +285,8 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) + ret i64 %1 } declare i64 @llvm.x86.bmi.pext.64(i64, i64) diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -990,7 +990,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1004,9 +1005,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i32 %a0 to double - ret double %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i32 %a0 to double + ret double %1 } define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) { @@ -1034,7 +1035,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -1049,10 +1051,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i32 %a0 to double - %3 = insertelement <2 
x double> zeroinitializer, double %2, i64 0 - ret <2 x double> %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i32 %a0 to double + %2 = insertelement <2 x double> zeroinitializer, double %1, i64 0 + ret <2 x double> %2 } define double @stack_fold_cvtsi642sd(i64 %a0) { @@ -1080,7 +1082,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1094,9 +1097,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to double - ret double %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to double + ret double %1 } define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) { @@ -1124,7 +1127,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -1139,10 +1143,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to double - %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0 - ret <2 x double> %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to double + %2 = insertelement <2 x double> zeroinitializer, double %1, i64 0 + ret <2 x double> %2 } define float @stack_fold_cvtsi2ss(i32 %a0) { @@ -1261,7 +1265,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1275,9 +1280,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to float - ret float %2 + call void asm sideeffect "nop", 
"~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to float + ret float %1 } define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) { @@ -1305,7 +1310,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: popq %rbx @@ -1321,10 +1327,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to float - %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0 - ret <4 x float> %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to float + %2 = insertelement <4 x float> zeroinitializer, float %1, i64 0 + ret <4 x float> %2 } ; TODO stack_fold_cvtss2si diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -647,9 +647,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i32 %a0 to double - ret double %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i32 %a0 to double + ret double %1 } define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0, <2 x double> %b0) { @@ -691,10 +691,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i32 %a0 to double - %3 = insertelement <2 x double> %b0, double %2, i64 0 - ret <2 x double> %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i32 %a0 to double + %2 = insertelement <2 x double> %b0, double %1, i64 0 + ret <2 x double> %2 } define double @stack_fold_cvtsi642sd(i64 %a0) { @@ -737,9 +737,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 
= sitofp i64 %a0 to double - ret double %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to double + ret double %1 } define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0, <2 x double> %b0) { @@ -781,10 +781,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to double - %3 = insertelement <2 x double> %b0, double %2, i64 0 - ret <2 x double> %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to double + %2 = insertelement <2 x double> %b0, double %1, i64 0 + ret <2 x double> %2 } define float @stack_fold_cvtsi2ss(i32 %a0) { @@ -917,9 +917,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to float - ret float %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to float + ret float %1 } define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0, <4 x float> %b0) { @@ -961,10 +961,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sitofp i64 %a0 to float - %3 = insertelement <4 x float> %b0, float %2, i64 0 - ret <4 x float> %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sitofp i64 %a0 to float + %2 = insertelement <4 x float> %b0, float %1, i64 0 + ret <4 x float> %2 } define double @stack_fold_cvtss2sd(float %a0) minsize { diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll @@ -141,11 +141,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + 
%1 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0 ; add forces execution domain - %3 = add <4 x i32> %2, - ret <4 x i32> %3 + %2 = add <4 x i32> %1, + ret <4 x i32> %2 } define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) { @@ -191,7 +191,7 @@ ; add forces execution domain %1 = add <4 x i32> %a0, %a1 %2 = extractelement <4 x i32> %1, i32 0 - %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i32 %2 } @@ -257,7 +257,7 @@ ; add forces execution domain %1 = add <2 x i64> %a0, %a1 %2 = extractelement <2 x i64> %1, i32 0 - %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i64 %2 } @@ -866,7 +866,7 @@ ; add forces execution domain %1 = add <4 x i32> %a0, %a1 %2 = extractelement <4 x i32> %1, i32 1 - %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i32 %2 } @@ -910,7 +910,7 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = extractelement <2 x i64> %a0, i32 1 - %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i64 %1 } @@ -1060,9 +1060,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 - ret <16 x i8> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 + ret <16 x i8> %1 } define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { @@ -1104,9 +1104,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1 - ret <4 
x i32> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1 + ret <4 x i32> %1 } define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) { @@ -1148,9 +1148,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 - ret <2 x i64> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <2 x i64> %a0, i64 %a1, i32 1 + ret <2 x i64> %1 } define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { @@ -1192,9 +1192,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 - ret <8 x i16> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 + ret <8 x i16> %1 } define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -2535,7 +2535,7 @@ ; add forces execution domain %1 = add <4 x i32> %a0, %a1 %2 = extractelement <4 x i32> %1, i32 1 - %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i32 %2 } @@ -2579,7 +2579,7 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = extractelement <2 x i64> %a0, i32 1 - %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i64 %1 } @@ -2622,9 +2622,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 - 
ret <16 x i8> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 + ret <16 x i8> %1 } define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { @@ -2710,9 +2710,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 - ret <2 x i64> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <2 x i64> %a0, i64 %a1, i32 1 + ret <2 x i64> %1 } define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { @@ -2754,9 +2754,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 - ret <8 x i16> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 + ret <8 x i16> %1 } define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) { diff --git a/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll @@ -277,9 +277,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) - ret i64 %2 + call void asm sideeffect "nop", "~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) + ret i64 %1 } declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind @@ -441,7 +441,7 @@ ; add forces execution domain %1 = add <2 x i64> %a0, %a1 %2 = extractelement <2 x i64> %1, i32 0 - %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i64 %2 } @@ -1148,7 +1148,7 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = extractelement <2 x i64> %a0, i32 1 - %2 = tail call <2 x i64> asm sideeffect 
"nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i64 %1 } @@ -1384,9 +1384,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 - ret <2 x i64> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <2 x i64> %a0, i64 %a1, i32 1 + ret <2 x i64> %1 } define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { @@ -1428,9 +1428,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 - ret <8 x i16> %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 + ret <8 x i16> %1 } define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { diff --git a/llvm/test/CodeGen/X86/stack-folding-lwp.ll b/llvm/test/CodeGen/X86/stack-folding-lwp.ll --- a/llvm/test/CodeGen/X86/stack-folding-lwp.ll +++ b/llvm/test/CodeGen/X86/stack-folding-lwp.ll @@ -52,9 +52,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 2814) - ret i8 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 2814) + ret i8 %1 } declare i8 @llvm.x86.lwpins32(i32, i32, i32) @@ -101,9 +101,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2814) - ret i8 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2814) + ret i8 %1 } declare i8 @llvm.x86.lwpins64(i64, i32, i32) @@ -149,7 +149,7 
@@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 2814) ret void } @@ -197,7 +197,7 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 2814) ret void } diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll --- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll +++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll @@ -154,7 +154,7 @@ ; CHECK-NEXT: retq %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0) %2 = bitcast x86_mmx %1 to i64 - %3 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret i64 %2 } diff --git a/llvm/test/CodeGen/X86/stack-folding-tbm.ll b/llvm/test/CodeGen/X86/stack-folding-tbm.ll --- a/llvm/test/CodeGen/X86/stack-folding-tbm.ll +++ b/llvm/test/CodeGen/X86/stack-folding-tbm.ll @@ -49,9 +49,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 3841) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 3841) + ret i32 %1 } declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) @@ -95,9 +95,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 3841) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 3841) 
+ ret i64 %1 } declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) @@ -140,10 +140,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i32 %a0, 1 - %3 = and i32 %a0, %2 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i32 %a0, 1 + %2 = and i32 %a0, %1 + ret i32 %2 } define i64 @stack_fold_blcfill_u64(i64 %a0) { @@ -185,10 +185,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i64 %a0, 1 - %3 = and i64 %a0, %2 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i64 %a0, 1 + %2 = and i64 %a0, %1 + ret i64 %2 } define i32 @stack_fold_blci_u32(i32 %a0) { @@ -230,11 +230,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i32 %a0, 1 - %3 = xor i32 %2, -1 - %4 = or i32 %a0, %3 - ret i32 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i32 %a0, 1 + %2 = xor i32 %1, -1 + %3 = or i32 %a0, %2 + ret i32 %3 } define i64 @stack_fold_blci_u64(i64 %a0) { @@ -276,11 +276,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i64 %a0, 1 - %3 = xor i64 %2, -1 - %4 = or i64 %a0, %3 - ret i64 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i64 %a0, 1 + %2 = xor i64 %1, -1 + %3 = or i64 %a0, %2 + ret i64 %3 } define i32 @stack_fold_blcic_u32(i32 %a0) { @@ -322,11 +322,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i32 %a0, 1 - %3 = xor i32 %a0, -1 - %4 = and i32 %2, %3 - ret i32 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i32 %a0, 1 + %2 = xor 
i32 %a0, -1 + %3 = and i32 %1, %2 + ret i32 %3 } define i64 @stack_fold_blcic_u64(i64 %a0) { @@ -368,11 +368,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i64 %a0, 1 - %3 = xor i64 %a0, -1 - %4 = and i64 %2, %3 - ret i64 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i64 %a0, 1 + %2 = xor i64 %a0, -1 + %3 = and i64 %1, %2 + ret i64 %3 } define i32 @stack_fold_blcmsk_u32(i32 %a0) { @@ -414,10 +414,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i32 %a0, 1 - %3 = xor i32 %a0, %2 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i32 %a0, 1 + %2 = xor i32 %a0, %1 + ret i32 %2 } define i64 @stack_fold_blcmsk_u64(i64 %a0) { @@ -459,10 +459,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i64 %a0, 1 - %3 = xor i64 %a0, %2 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i64 %a0, 1 + %2 = xor i64 %a0, %1 + ret i64 %2 } define i32 @stack_fold_blcs_u32(i32 %a0) { @@ -504,10 +504,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i32 %a0, 1 - %3 = or i32 %a0, %2 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i32 %a0, 1 + %2 = or i32 %a0, %1 + ret i32 %2 } define i64 @stack_fold_blcs_u64(i64 %a0) { @@ -549,10 +549,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i64 %a0, 1 - %3 = or i64 %a0, %2 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i64 %a0, 1 + %2 = or i64 %a0, %1 + ret i64 %2 } define i32 
@stack_fold_blsfill_u32(i32 %a0) { @@ -594,10 +594,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i32 %a0, 1 - %3 = or i32 %a0, %2 - ret i32 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i32 %a0, 1 + %2 = or i32 %a0, %1 + ret i32 %2 } define i64 @stack_fold_blsfill_u64(i64 %a0) { @@ -639,10 +639,10 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i64 %a0, 1 - %3 = or i64 %a0, %2 - ret i64 %3 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i64 %a0, 1 + %2 = or i64 %a0, %1 + ret i64 %2 } define i32 @stack_fold_blsic_u32(i32 %a0) { @@ -684,11 +684,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i32 %a0, 1 - %3 = xor i32 %a0, -1 - %4 = or i32 %2, %3 - ret i32 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i32 %a0, 1 + %2 = xor i32 %a0, -1 + %3 = or i32 %1, %2 + ret i32 %3 } define i64 @stack_fold_blsic_u64(i64 %a0) { @@ -730,11 +730,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i64 %a0, 1 - %3 = xor i64 %a0, -1 - %4 = or i64 %2, %3 - ret i64 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i64 %a0, 1 + %2 = xor i64 %a0, -1 + %3 = or i64 %1, %2 + ret i64 %3 } define i32 @stack_fold_t1mskc_u32(i32 %a0) { @@ -776,11 +776,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i32 %a0, 1 - %3 = xor i32 %a0, -1 - %4 = or i32 %2, %3 - ret i32 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i32 %a0, 1 + %2 = xor i32 %a0, -1 + %3 = or i32 %1, 
%2 + ret i32 %3 } define i64 @stack_fold_t1mskc_u64(i64 %a0) { @@ -822,11 +822,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = add i64 %a0, 1 - %3 = xor i64 %a0, -1 - %4 = or i64 %2, %3 - ret i64 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = add i64 %a0, 1 + %2 = xor i64 %a0, -1 + %3 = or i64 %1, %2 + ret i64 %3 } define i32 @stack_fold_tzmsk_u32(i32 %a0) { @@ -868,11 +868,11 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i32 %a0, 1 - %3 = xor i32 %a0, -1 - %4 = and i32 %2, %3 - ret i32 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i32 %a0, 1 + %2 = xor i32 %a0, -1 + %3 = and i32 %1, %2 + ret i32 %3 } define i64 @stack_fold_tzmsk_u64(i64 %a0) { @@ -914,9 +914,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = sub i64 %a0, 1 - %3 = xor i64 %a0, -1 - %4 = and i64 %2, %3 - ret i64 %4 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = sub i64 %a0, 1 + %2 = xor i64 %a0, -1 + %3 = and i64 %1, %2 + ret i64 %3 } diff --git a/llvm/test/CodeGen/X86/stack-folding-x86_64.ll b/llvm/test/CodeGen/X86/stack-folding-x86_64.ll --- a/llvm/test/CodeGen/X86/stack-folding-x86_64.ll +++ b/llvm/test/CodeGen/X86/stack-folding-x86_64.ll @@ -51,9 +51,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call i32 @llvm.cttz.i32(i32 %a0, i1 -1) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i32 @llvm.cttz.i32(i32 %a0, i1 -1) + ret i32 %1 } declare i32 @llvm.cttz.i32(i32, i1) @@ -96,9 +96,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call i64 @llvm.cttz.i64(i64 %a0, i1 -1) - ret i64 %2 + call void asm sideeffect "nop", 
"~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i64 @llvm.cttz.i64(i64 %a0, i1 -1) + ret i64 %1 } declare i64 @llvm.cttz.i64(i64, i1) @@ -145,9 +145,9 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call i32 @llvm.ctlz.i32(i32 %a0, i1 -1) - ret i32 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i32 @llvm.ctlz.i32(i32 %a0, i1 -1) + ret i32 %1 } declare i32 @llvm.ctlz.i32(i32, i1) @@ -191,8 +191,8 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() - %2 = call i64 @llvm.ctlz.i64(i64 %a0, i1 -1) - ret i64 %2 + call void asm sideeffect "nop", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() + %1 = call i64 @llvm.ctlz.i64(i64 %a0, i1 -1) + ret i64 %1 } declare i64 @llvm.ctlz.i64(i64, i1)