diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -85,6 +85,69 @@ namespace { +/// Check that \p MI does not have implicit uses that overlap with its \p Use +/// operand (the register being replaced), since these can sometimes be +/// implicitly tied to other operands. For example, on AMDGPU: +/// +/// V_MOVRELS_B32_e32 %VGPR2, %M0, %EXEC, +/// %VGPR2_VGPR3_VGPR4_VGPR5 +/// +/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no +/// way of knowing we need to update the latter when updating the former. +bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use, + const TargetRegisterInfo &TRI) { + for (const MachineOperand &MIUse : MI.uses()) + if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() && + MIUse.isUse() && TRI.regsOverlap(Use.getReg(), MIUse.getReg())) + return true; + + return false; +} + +/// Decide whether we should forward the source of \param Copy to its use in +/// \param UseI based on the physical register class constraints of the opcode +/// and avoiding introducing more cross-class COPYs. +bool isForwardableRegClassCopy(const Register &NewReg, const MachineInstr &UseI, + unsigned UseIdx, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI) { + + // If the new register meets the opcode register constraints, then allow + // forwarding. + if (const TargetRegisterClass *URC = + UseI.getRegClassConstraint(UseIdx, &TII, &TRI)) + return URC->contains(NewReg); + + if (!UseI.isCopy()) + return false; + + /// COPYs don't have register class constraints, so if the user instruction + /// is a COPY, we just try to avoid introducing additional cross-class + /// COPYs. For example: + /// + /// RegClassA = COPY RegClassB // Copy parameter + /// ... 
+ /// RegClassB = COPY RegClassA // UseI parameter + /// + /// which after forwarding becomes + /// + /// RegClassA = COPY RegClassB + /// ... + /// RegClassB = COPY RegClassB + /// + /// so we have reduced the number of cross-class COPYs and potentially + /// introduced a nop COPY that can be removed. + const TargetRegisterClass *UseDstRC = + TRI.getMinimalPhysRegClass(UseI.getOperand(0).getReg()); + + const TargetRegisterClass *SuperRC = UseDstRC; + for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses(); + SuperRC; SuperRC = *SuperRCI++) + if (SuperRC->contains(NewReg)) + return true; + + return false; +} + class CopyTracker { struct CopyInfo { MachineInstr *MI; @@ -94,6 +157,10 @@ DenseMap Copies; + /// Mapping registers to where they are used later. Key is actual registers + /// (not register units) Only used for Backward Propagation + DenseMap> Uses; + public: /// Mark all of the given registers and their subregisters as unavailable for /// copying. @@ -115,21 +182,37 @@ // enough. We have to find the COPY defines Reg or registers defined by Reg // and invalidate all of them. SmallSet RegsToInvalidate; - RegsToInvalidate.insert(Reg); + + // Remove in 2 stages to make sure when we invalidate a register, all traces + // of the corresponding copy instruction are removed from Copies for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { auto I = Copies.find(*RUI); if (I != Copies.end()) { - if (MachineInstr *MI = I->second.MI) { + if (MachineInstr *MI = I->second.MI) RegsToInvalidate.insert(MI->getOperand(0).getReg().asMCReg()); - RegsToInvalidate.insert(MI->getOperand(1).getReg().asMCReg()); - } RegsToInvalidate.insert(I->second.DefRegs.begin(), I->second.DefRegs.end()); } } - for (MCRegister InvalidReg : RegsToInvalidate) + + // Make sure when we remove Src registers from Copies, we remove the entire + // register used in the copy instruction. 
+ for (MCRegister InvalidDefReg : RegsToInvalidate) { + for (MCRegUnitIterator RUI(InvalidDefReg, &TRI); RUI.isValid(); ++RUI) { + auto I = Copies.find(*RUI); + if (I != Copies.end()) { + assert(I->second.MI); + RegsToInvalidate.insert( + I->second.MI->getOperand(1).getReg().asMCReg()); + } + } + } + + for (MCRegister InvalidReg : RegsToInvalidate) { for (MCRegUnitIterator RUI(InvalidReg, &TRI); RUI.isValid(); ++RUI) Copies.erase(*RUI); + Uses.erase(InvalidReg); + } } /// Clobber a single register, removing it from the tracker's copy maps. @@ -171,9 +254,7 @@ } } - bool hasAnyCopies() { - return !Copies.empty(); - } + bool hasAnyCopies() { return !Copies.empty(); } MachineInstr *findCopyForUnit(MCRegister RegUnit, const TargetRegisterInfo &TRI, @@ -186,6 +267,8 @@ return CI->second.MI; } + /// Find the corresponding copy instruction where \p RegUnit appeared as the + /// Src MachineInstr *findCopyDefViaUnit(MCRegister RegUnit, const TargetRegisterInfo &TRI) { auto CI = Copies.find(RegUnit); @@ -197,13 +280,14 @@ return findCopyForUnit(*RUI, TRI, true); } - MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg, - const TargetRegisterInfo &TRI) { + std::pair> + findAvailBackwardCopy(MachineInstr &I, MCRegister Reg, + const TargetRegisterInfo &TRI) { MCRegUnitIterator RUI(Reg, &TRI); MachineInstr *AvailCopy = findCopyDefViaUnit(*RUI, TRI); if (!AvailCopy || !TRI.isSubRegisterEq(AvailCopy->getOperand(1).getReg(), Reg)) - return nullptr; + return {nullptr, {}}; Register AvailSrc = AvailCopy->getOperand(1).getReg(); Register AvailDef = AvailCopy->getOperand(0).getReg(); @@ -213,9 +297,100 @@ if (MO.isRegMask()) // FIXME: Shall we simultaneously invalidate AvailSrc or AvailDef? if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef)) - return nullptr; + return {nullptr, {}}; - return AvailCopy; + auto UseI = Uses.find(AvailSrc); + return {AvailCopy, + UseI == Uses.end() ? 
SmallVector{} : UseI->second}; + } + + /// Track uses of registers that are candidates for backward copy propagation + void trackUse(MachineInstr *MI, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII) { + SmallSet RegsToInvalidate; + for (unsigned OpIdx = 0, OpEnd = MI->getNumOperands(); OpIdx != OpEnd; + ++OpIdx) { + MachineOperand &MOUse = MI->getOperand(OpIdx); + if (!MOUse.isReg() || !MOUse.isUse() || MOUse.isUndef()) { + continue; + } + if (!MOUse.getReg()) { + continue; + } + + MCRegister Use = MOUse.getReg().asMCReg(); + + // Three cases where we should give up propagating copies: + // + // 1) If a read register overlaps, but is not equal to, some of the + // candidate src registers, we need to give up on propagating those + // overlapping registers. + // + // 2) If a copy candidate's Def is read or partially read. + // + // 3) This instruction has uses, but we don't know how to + // rewrite those uses because of overlaps/ties/unrenamable registers, so + // give up on propagating copies related to these uses + bool isOverlappingUse = false; + Register candidateSrc = {0}, candidateDef = {0}; + for (MCRegUnitIterator RUI(Use, &TRI); RUI.isValid(); ++RUI) { + auto CopyInfoI = Copies.find(*RUI); + if (CopyInfoI == Copies.end()) { + continue; + } + if (!CopyInfoI->second.Avail) { + // Use matches or overlaps with an Src + // Find the actual Src in the copy instruction + MachineInstr *Copy = findCopyDefViaUnit(*RUI, TRI); + MCRegister Src = Copy->getOperand(1).getReg().asMCReg(); + MCRegister Def = Copy->getOperand(0).getReg().asMCReg(); + if (Src != Use) { + // Case (1) + isOverlappingUse = true; + RegsToInvalidate.insert(Src); + RegsToInvalidate.insert(Def); + } else { + candidateSrc = Src; + candidateDef = Def; + break; + } + } else { + // Case (2) + RegsToInvalidate.insert( + CopyInfoI->second.MI->getOperand(0).getReg().asMCReg()); + RegsToInvalidate.insert( + CopyInfoI->second.MI->getOperand(1).getReg().asMCReg()); + } + } + + // Can't have matching 
and overlapping Srcs at the same time + assert(!candidateSrc || !isOverlappingUse); + + if (candidateSrc) { + // implies: !isOverlappingUse + if (!MI->isDebugValue() && + (hasImplicitOverlap(*MI, MOUse, TRI) || !MOUse.isRenamable() || + MOUse.isTied() || + !isForwardableRegClassCopy(candidateSrc, *MI, OpIdx, TII, TRI))) { + // Case (3) + RegsToInvalidate.insert(candidateSrc.asMCReg()); + RegsToInvalidate.insert(candidateDef.asMCReg()); + continue; + } + + // Add to Uses for future rewrite + auto I = Uses.insert({Use, {MI}}); + if (!I.second) { + I.first->second.push_back(MI); + } + } + } + + for (MCRegister InvalidReg : RegsToInvalidate) { + for (MCRegUnitIterator RUI(InvalidReg, &TRI); RUI.isValid(); ++RUI) + Copies.erase(*RUI); + Uses.erase(InvalidReg); + } } MachineInstr *findAvailCopy(MachineInstr &DestCopy, MCRegister Reg, @@ -245,6 +420,7 @@ void clear() { Copies.clear(); + Uses.clear(); } }; @@ -281,12 +457,9 @@ bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def); void forwardUses(MachineInstr &MI); void propagateDefs(MachineInstr &MI); - bool isForwardableRegClassCopy(const MachineInstr &Copy, - const MachineInstr &UseI, unsigned UseIdx); bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx); - bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use); bool hasOverlappingMultipleDef(const MachineInstr &MI, const MachineOperand &MODef, Register Def); @@ -294,7 +467,7 @@ SmallSetVector MaybeDeadCopies; /// Multimap tracking debug users in current BB - DenseMap> CopyDbgUsers; + DenseMap> CopyDbgUsers; CopyTracker Tracker; @@ -396,70 +569,6 @@ return false; } -/// Decide whether we should forward the source of \param Copy to its use in -/// \param UseI based on the physical register class constraints of the opcode -/// and avoiding introducing more cross-class COPYs. 
-bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy, - const MachineInstr &UseI, - unsigned UseIdx) { - - Register CopySrcReg = Copy.getOperand(1).getReg(); - - // If the new register meets the opcode register constraints, then allow - // forwarding. - if (const TargetRegisterClass *URC = - UseI.getRegClassConstraint(UseIdx, TII, TRI)) - return URC->contains(CopySrcReg); - - if (!UseI.isCopy()) - return false; - - /// COPYs don't have register class constraints, so if the user instruction - /// is a COPY, we just try to avoid introducing additional cross-class - /// COPYs. For example: - /// - /// RegClassA = COPY RegClassB // Copy parameter - /// ... - /// RegClassB = COPY RegClassA // UseI parameter - /// - /// which after forwarding becomes - /// - /// RegClassA = COPY RegClassB - /// ... - /// RegClassB = COPY RegClassB - /// - /// so we have reduced the number of cross-class COPYs and potentially - /// introduced a nop COPY that can be removed. - const TargetRegisterClass *UseDstRC = - TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg()); - - const TargetRegisterClass *SuperRC = UseDstRC; - for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses(); - SuperRC; SuperRC = *SuperRCI++) - if (SuperRC->contains(CopySrcReg)) - return true; - - return false; -} - -/// Check that \p MI does not have implicit uses that overlap with it's \p Use -/// operand (the register being replaced), since these can sometimes be -/// implicitly tied to other operands. For example, on AMDGPU: -/// -/// V_MOVRELS_B32_e32 %VGPR2, %M0, %EXEC, %VGPR2_VGPR3_VGPR4_VGPR5 -/// -/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no -/// way of knowing we need to update the latter when updating the former. 
-bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI, - const MachineOperand &Use) { - for (const MachineOperand &MIUse : MI.uses()) - if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() && - MIUse.isUse() && TRI->regsOverlap(Use.getReg(), MIUse.getReg())) - return true; - - return false; -} - /// For an MI that has multiple definitions, check whether \p MI has /// a definition that overlaps with another of its definitions. /// For example, on ARM: umull r9, r9, lr, r0 @@ -526,10 +635,10 @@ if (MRI->isReserved(CopySrcReg) && !MRI->isConstantPhysReg(CopySrcReg)) continue; - if (!isForwardableRegClassCopy(*Copy, MI, OpIdx)) + if (!isForwardableRegClassCopy(CopySrcReg, MI, OpIdx, *TII, *TRI)) continue; - if (hasImplicitOverlap(MI, MOUse)) + if (hasImplicitOverlap(MI, MOUse, *TRI)) continue; // Check that the instruction is not a copy that partially overwrites the @@ -571,7 +680,7 @@ LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName() << "\n"); - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) { + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr *MI = &*I; ++I; @@ -784,35 +893,61 @@ if (!MODef.isRenamable()) continue; - MachineInstr *Copy = + auto Copy = Tracker.findAvailBackwardCopy(MI, MODef.getReg().asMCReg(), *TRI); - if (!Copy) + if (!Copy.first) continue; - Register Def = Copy->getOperand(0).getReg(); - Register Src = Copy->getOperand(1).getReg(); + Register Def = Copy.first->getOperand(0).getReg(); + Register Src = Copy.first->getOperand(1).getReg(); if (MODef.getReg() != Src) continue; - if (!isBackwardPropagatableRegClassCopy(*Copy, MI, OpIdx)) + if (!isBackwardPropagatableRegClassCopy(*Copy.first, MI, OpIdx)) continue; - if (hasImplicitOverlap(MI, MODef)) + if (hasImplicitOverlap(MI, MODef, *TRI)) { continue; + } if (hasOverlappingMultipleDef(MI, MODef, Def)) continue; LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) << "\n 
with " << printReg(Def, TRI) << "\n in " - << MI << " from " << *Copy); + << MI << " from " << *Copy.first); + bool isRenamable = Copy.first->getOperand(0).isRenamable(); MODef.setReg(Def); - MODef.setIsRenamable(Copy->getOperand(0).isRenamable()); + MODef.setIsRenamable(isRenamable); LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); - MaybeDeadCopies.insert(Copy); + + // Update uses of the original Def. + // We don't need to perform checks here, Uses only contains rewrittable + // uses + for (MachineInstr *User : Copy.second) { + LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(Src, TRI) + << "\n with " << printReg(Def, TRI) << "\n in " + << *User << " from " << *Copy.first); + if (User->isDebugValue()) { + MRI->updateDbgUsersToReg(Def, User); + } else { + for (unsigned UseIdx = 0, UseEnd = User->getNumOperands(); + UseIdx != UseEnd; ++UseIdx) { + MachineOperand &MOUse = User->getOperand(UseIdx); + if (!MOUse.isReg() || MOUse.getReg() != Src) { + continue; + } + MOUse.setReg(Def); + MOUse.setIsRenamable(isRenamable); + } + } + LLVM_DEBUG(dbgs() << "MCP: After replacement: " << *User << "\n"); + } + + MaybeDeadCopies.insert(Copy.first); Changed = true; ++NumCopyBackwardPropagated; } @@ -839,7 +974,11 @@ // Unlike forward cp, we don't invoke propagateDefs here, // just let forward cp do COPY-to-COPY propagation. if (isBackwardPropagatableCopy(*MI, *MRI)) { + // Remove copies related to Src. Src can only possibly appear + // in copy candidates as define, because otherwise the current copy + // won't kill Src. Tracker.invalidateRegister(Src, *TRI); + // Remove copies related to Def. Tracker.invalidateRegister(Def, *TRI); Tracker.trackCopy(MI, *TRI); continue; @@ -856,6 +995,9 @@ } propagateDefs(*MI); + + // Track uses after propagation, because we need the correct Def register. 
+ Tracker.trackUse(MI, *TRI, *TII); for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -863,11 +1005,14 @@ if (!MO.getReg()) continue; - if (MO.isDef()) - Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI); - - if (MO.readsReg()) + if (MO.isDef()) { + // Even if we did apply propagation, the relevant copy instruction + // would still get invalidated here. The original instruction would + // trigger invalidation because its Def matches the Src of the relevant + // copy, the updated instruction would still do because its Def now + // matches the Def of the relevant copy. Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI); + } } } diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1026,8 +1026,8 @@ ; X64-NEXT: movq %rbx, %r10 ; X64-NEXT: andq %r13, %r10 ; X64-NEXT: shlq $4, %r10 -; X64-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 -; X64-NEXT: andq %rax, %rbx +; X64-NEXT: movabsq $-1085102592571150096, %r15 # imm = 0xF0F0F0F0F0F0F0F0 +; X64-NEXT: andq %r15, %rbx ; X64-NEXT: shrq $4, %rbx ; X64-NEXT: orq %r10, %rbx ; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 @@ -1048,7 +1048,7 @@ ; X64-NEXT: movq %rbp, %rdi ; X64-NEXT: andq %r13, %rdi ; X64-NEXT: shlq $4, %rdi -; X64-NEXT: andq %rax, %rbp +; X64-NEXT: andq %r15, %rbp ; X64-NEXT: shrq $4, %rbp ; X64-NEXT: orq %rdi, %rbp ; X64-NEXT: movq %rbp, %rdi @@ -1069,8 +1069,7 @@ ; X64-NEXT: movq %rbp, %r10 ; X64-NEXT: andq %r13, %r10 ; X64-NEXT: shlq $4, %r10 -; X64-NEXT: andq %rax, %rbp -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: andq %r15, %rbp ; X64-NEXT: shrq $4, %rbp ; X64-NEXT: orq %r10, %rbp ; X64-NEXT: movq %rbp, %r10 diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ 
b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1298,13 +1298,12 @@ ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1381,13 +1380,12 @@ ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -4305,18 +4305,17 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 124(%edi), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 124(%ebx), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl 120(%edi), %esi -; X32-NEXT: movl %edi, %ebx +; X32-NEXT: movl 120(%ebx), %esi ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; 
X32-NEXT: movl 112(%edi), %edi +; X32-NEXT: movl 112(%ebx), %edi ; X32-NEXT: movl 116(%ebx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -4885,17 +4884,16 @@ ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r12, %rsi ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq 24(%rdi), %rdi +; X64-NEXT: movq 24(%rdi), %r12 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: adcq %rbx, %rbp ; X64-NEXT: setb %r9b ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rbp, %rbx @@ -5042,8 +5040,8 @@ ; X64-NEXT: addq %r12, %rbx ; X64-NEXT: adcq %r11, %r8 ; X64-NEXT: setb %r14b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %r10, (%rsp) # 8-byte Spill ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx @@ -5055,8 +5053,7 @@ ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx @@ -5086,25 +5083,22 @@ ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 32(%rsi), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 32(%r13), %r11 ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r9, %rbx ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 40(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 40(%r13), %r8 ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rbp, %r9 @@ -5239,29 +5233,27 @@ ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rbx, %r12 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %bl ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rcx, %r13 @@ -5382,26 +5374,23 @@ ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 64(%rsi), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 64(%r13), %r8 ; X64-NEXT: movq (%rsp), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 72(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 72(%r13), %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rbp, %r14 @@ -5483,8 +5472,8 @@ ; X64-NEXT: addq %r15, %rsi ; X64-NEXT: adcq %r10, %rbx ; X64-NEXT: setb %r9b -; X64-NEXT: movq (%rsp), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq (%rsp), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 @@ -5495,8 +5484,7 @@ ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, %r15 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax @@ -5567,12 +5555,11 @@ ; X64-NEXT: adcq %rax, %r13 ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq %r8, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq 120(%rdx), %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 120(%rdi), %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq 112(%rdx), %rsi -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq 112(%rdi), %rsi 
; X64-NEXT: movq %rax, %rbx ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %r10 @@ -5764,19 +5751,18 @@ ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq 96(%rbp), %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 96(%rdi), %rsi ; X64-NEXT: imulq %rsi, %r9 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %r9, %rdx -; X64-NEXT: movq 104(%rbp), %r15 +; X64-NEXT: movq 104(%rdi), %r15 ; X64-NEXT: imulq %r15, %rbx ; X64-NEXT: addq %rdx, %rbx ; X64-NEXT: movq %rbx, %r9 -; X64-NEXT: movq 112(%rbp), %rax -; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq 112(%rdi), %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: imulq %rbx, %rcx diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -796,18 +796,17 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 60(%edi), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 60(%ebx), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl 56(%edi), %esi -; X32-NEXT: movl %edi, %ebx +; X32-NEXT: movl 56(%ebx), %esi ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 48(%edi), %edi +; X32-NEXT: movl 48(%ebx), %edi ; X32-NEXT: movl 52(%ebx), %ebp ; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -1375,12 +1374,11 @@ ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r12, %r13 ; X64-NEXT: adcq %rbp, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq 56(%rdx), %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq 56(%rsi), %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq 48(%rdx), %rbp -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq 48(%rsi), %rbp ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %r12 diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -18,17 +18,15 @@ ; CHECK-NEXT: movq %rbx, %rbp ; CHECK-NEXT: andq %rdi, %rbp ; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0 -; CHECK-NEXT: andq %r11, %rbx -; CHECK-NEXT: movq %r11, %rax +; CHECK-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 +; CHECK-NEXT: andq %rax, %rbx ; CHECK-NEXT: shrq $4, %rbx ; CHECK-NEXT: orq %rbp, %rbx ; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 ; CHECK-NEXT: movq %rbx, %r14 ; CHECK-NEXT: andq %r11, %r14 -; CHECK-NEXT: movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: movq %rbp, %r15 +; CHECK-NEXT: movabsq $-3689348814741910324, %r15 # imm = 0xCCCCCCCCCCCCCCCC +; CHECK-NEXT: andq %r15, %rbx ; CHECK-NEXT: shrq $2, %rbx ; CHECK-NEXT: leaq (%rbx,%r14,4), %r14 ; CHECK-NEXT: movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000 diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -337,11 +337,10 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl 
$32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ebx @@ -586,8 +585,8 @@ ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %ebx @@ -598,8 +597,7 @@ ; X86-NEXT: pushl %ecx ; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebx ; X86-NEXT: calll __moddi3 diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -985,10 +985,9 @@ ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -172,9 +172,8 @@ ; 
X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrdl $3, %eax, %esi -; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: shrdl $3, %edi, %esi ; X32-NEXT: shrl $3, %edi ; X32-NEXT: movl (%ecx), %eax ; X32-NEXT: movl 4(%ecx), %edx diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -2082,17 +2082,16 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movzwl 16(%eax), %edx ; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE-NEXT: movdqa (%eax), %xmm3 +; X86-SSE-NEXT: movdqa (%eax), %xmm5 ; X86-SSE-NEXT: movdqa (%ecx), %xmm0 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm1 ; X86-SSE-NEXT: pxor %xmm4, %xmm4 -; X86-SSE-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE-NEXT: pextrw $7, %xmm3, %eax -; X86-SSE-NEXT: pextrw $4, %xmm3, %esi -; X86-SSE-NEXT: pextrw $0, %xmm3, %edi -; X86-SSE-NEXT: pextrw $1, %xmm3, %ebx -; X86-SSE-NEXT: pextrw $3, %xmm3, %ebp -; X86-SSE-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE-NEXT: pextrw $7, %xmm5, %eax +; X86-SSE-NEXT: pextrw $4, %xmm5, %esi +; X86-SSE-NEXT: pextrw $0, %xmm5, %edi +; X86-SSE-NEXT: pextrw $1, %xmm5, %ebx +; X86-SSE-NEXT: pextrw $3, %xmm5, %ebp ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] @@ -2327,17 +2326,16 @@ ; X64-SSE-LABEL: PR34947: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movzwl 16(%rdi), %r8d -; X64-SSE-NEXT: movdqa (%rdi), %xmm3 +; X64-SSE-NEXT: movdqa (%rdi), %xmm5 ; X64-SSE-NEXT: movdqa (%rsi), %xmm0 ; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1 ; X64-SSE-NEXT: pxor %xmm4, %xmm4 -; X64-SSE-NEXT: movdqa %xmm3, %xmm2 -; 
X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %r9d -; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d -; X64-SSE-NEXT: pextrw $1, %xmm3, %r11d -; X64-SSE-NEXT: pextrw $3, %xmm3, %ecx -; X64-SSE-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE-NEXT: movdqa %xmm5, %xmm2 +; X64-SSE-NEXT: pextrw $7, %xmm5, %eax +; X64-SSE-NEXT: pextrw $4, %xmm5, %r9d +; X64-SSE-NEXT: pextrw $0, %xmm5, %r10d +; X64-SSE-NEXT: pextrw $1, %xmm5, %r11d +; X64-SSE-NEXT: pextrw $3, %xmm5, %ecx ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -559,11 +559,10 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_estimate: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm5 +; SSE-NEXT: rsqrtps %xmm0, %xmm6 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: mulps %xmm4, %xmm6 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; SSE-NEXT: addps %xmm5, %xmm0 diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -64,10 +64,9 @@ ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi