diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -46,6 +46,18 @@ // $R1 = OP ... // ... // +// or +// +// $R0 = OP ... +// ... // No read/clobber of $R1, no clobber of $R0 +// $R2 = OP $R0 // $R0 is used +// $R1 = COPY $R0 // $R0 is killed +// Replace $R0 and its uses with $R1 and remove the COPY +// $R1 = OP ... +// $R2 = OP $R1 +// ... +// +// //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" @@ -85,6 +97,69 @@ namespace { +/// Check that \p MI does not have implicit uses that overlap with its \p Use +/// operand (the register being replaced), since these can sometimes be +/// implicitly tied to other operands. For example, on AMDGPU: +/// +/// V_MOVRELS_B32_e32 %VGPR2, %M0, %EXEC, +/// %VGPR2_VGPR3_VGPR4_VGPR5 +/// +/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no +/// way of knowing we need to update the latter when updating the former. +bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use, + const TargetRegisterInfo &TRI) { + for (const MachineOperand &MIUse : MI.uses()) + if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() && + MIUse.isUse() && TRI.regsOverlap(Use.getReg(), MIUse.getReg())) + return true; + + return false; +} + +/// Decide whether we should forward \p NewReg, the source of a copy, to its +/// use in \p UseI, based on the physical register class constraints of the +/// opcode and on avoiding the introduction of more cross-class COPYs. +bool isForwardableRegClassCopy(const Register &NewReg, const MachineInstr &UseI, + unsigned UseIdx, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI) { + + // If the new register meets the opcode register constraints, then allow + // forwarding. + if (const TargetRegisterClass *URC = + UseI.getRegClassConstraint(UseIdx, &TII, &TRI)) + return URC->contains(NewReg); + + if (!UseI.isCopy()) + return false; + + /// COPYs don't have register class constraints, so if the user instruction + /// is a COPY, we just try to avoid introducing additional cross-class + /// COPYs. For example: + /// + /// RegClassA = COPY RegClassB // Copy parameter + /// ... + /// RegClassB = COPY RegClassA // UseI parameter + /// + /// which after forwarding becomes + /// + /// RegClassA = COPY RegClassB + /// ... + /// RegClassB = COPY RegClassB + /// + /// so we have reduced the number of cross-class COPYs and potentially + /// introduced a nop COPY that can be removed. + const TargetRegisterClass *UseDstRC = + TRI.getMinimalPhysRegClass(UseI.getOperand(0).getReg()); + + const TargetRegisterClass *SuperRC = UseDstRC; + for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses(); + SuperRC; SuperRC = *SuperRCI++) + if (SuperRC->contains(NewReg)) + return true; + + return false; +} + class CopyTracker { struct CopyInfo { MachineInstr *MI; @@ -94,6 +169,10 @@ DenseMap Copies; + /// Mapping from registers to the instructions that use them later. Keys are + /// actual registers (not register units). Only used for backward propagation. + DenseMap> Uses; + public: /// Mark all of the given registers and their subregisters as unavailable for /// copying. @@ -115,21 +194,37 @@ // enough. We have to find the COPY defines Reg or registers defined by Reg // and invalidate all of them.
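// For illustration, the new backward pattern in concrete MIR; the registers
// mirror the copyprop3 test added to machine-copy-prop.mir further down in
// this patch:
//
//    renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg
//    renamable $rax = COPY renamable $rbp
//    renamable $rbx = COPY killed renamable $rbp
//    NOOP implicit $rax
//
// After $rbp and its recorded use are replaced with $rbx, the killed COPY
// becomes dead and is erased:
//
//    renamable $rbx = LEA64r $rax, 1, $noreg, 8, $noreg
//    renamable $rax = COPY renamable $rbx
//    NOOP implicit $rax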
SmallSet RegsToInvalidate; - RegsToInvalidate.insert(Reg); + + // Remove in 2 stages to make sure when we invalidate a register, all traces + // of the corresponding copy instruction are removed from Copies for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { auto I = Copies.find(*RUI); if (I != Copies.end()) { - if (MachineInstr *MI = I->second.MI) { + if (MachineInstr *MI = I->second.MI) RegsToInvalidate.insert(MI->getOperand(0).getReg().asMCReg()); - RegsToInvalidate.insert(MI->getOperand(1).getReg().asMCReg()); - } RegsToInvalidate.insert(I->second.DefRegs.begin(), I->second.DefRegs.end()); } } - for (MCRegister InvalidReg : RegsToInvalidate) + + // Make sure when we remove Src registers from Copies, we remove the entire + // register used in the copy instruction. + for (MCRegister InvalidDefReg : RegsToInvalidate) { + for (MCRegUnitIterator RUI(InvalidDefReg, &TRI); RUI.isValid(); ++RUI) { + auto I = Copies.find(*RUI); + if (I != Copies.end()) { + assert(I->second.MI); + RegsToInvalidate.insert( + I->second.MI->getOperand(1).getReg().asMCReg()); + } + } + } + + for (MCRegister InvalidReg : RegsToInvalidate) { for (MCRegUnitIterator RUI(InvalidReg, &TRI); RUI.isValid(); ++RUI) Copies.erase(*RUI); + Uses.erase(InvalidReg); + } } /// Clobber a single register, removing it from the tracker's copy maps. @@ -171,9 +266,7 @@ } } - bool hasAnyCopies() { - return !Copies.empty(); - } + bool hasAnyCopies() { return !Copies.empty(); } MachineInstr *findCopyForUnit(MCRegister RegUnit, const TargetRegisterInfo &TRI, @@ -186,6 +279,8 @@ return CI->second.MI; } + /// Find the corresponding copy instruction where \p RegUnit appeared as the + /// Src. MachineInstr *findCopyDefViaUnit(MCRegister RegUnit, const TargetRegisterInfo &TRI) { auto CI = Copies.find(RegUnit); @@ -197,13 +292,14 @@ return findCopyForUnit(*RUI, TRI, true); } - MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg, - const TargetRegisterInfo &TRI) { + std::pair *> + findAvailBackwardCopy(MachineInstr &I, MCRegister Reg, + const TargetRegisterInfo &TRI) { MCRegUnitIterator RUI(Reg, &TRI); MachineInstr *AvailCopy = findCopyDefViaUnit(*RUI, TRI); if (!AvailCopy || !TRI.isSubRegisterEq(AvailCopy->getOperand(1).getReg(), Reg)) - return nullptr; + return {nullptr, nullptr}; Register AvailSrc = AvailCopy->getOperand(1).getReg(); Register AvailDef = AvailCopy->getOperand(0).getReg(); @@ -213,9 +309,96 @@ if (MO.isRegMask()) // FIXME: Shall we simultaneously invalidate AvailSrc or AvailDef? if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef)) - return nullptr; + return {nullptr, nullptr}; - return AvailCopy; + auto UseI = Uses.find(AvailSrc); + return {AvailCopy, UseI == Uses.end() ? nullptr : &UseI->second}; + } + + /// Track uses of registers that are candidates for backward copy propagation + void trackUse(MachineInstr *MI, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII) { + SmallSet RegsToInvalidate; + for (unsigned OpIdx = 0, OpEnd = MI->getNumOperands(); OpIdx != OpEnd; + ++OpIdx) { + MachineOperand &MOUse = MI->getOperand(OpIdx); + if (!MOUse.isReg() || !MOUse.isUse() || MOUse.isUndef()) + continue; + if (!MOUse.getReg()) + continue; + + MCRegister Use = MOUse.getReg().asMCReg(); + + // Three cases where we should give up propagating copies: + // + // 1) If a read register overlaps, but is not equal to, some of the + // candidate src registers, we need to give up on propagating those + // overlapping registers. + // + // 2) If a copy candidate's Def is read or partially read. 
+ // + // 3) This instruction has uses, but we don't know how to + // rewrite those uses because of overlaps/ties/unrenamble registers, so + // give up on propagating copies related to these uses + bool IsOverlappingUse = false; + Register CandidateSrc = {0}, CandidateDef = {0}; + for (MCRegUnitIterator RUI(Use, &TRI); RUI.isValid(); ++RUI) { + auto CopyInfoI = Copies.find(*RUI); + if (CopyInfoI == Copies.end()) + continue; + if (!CopyInfoI->second.Avail) { + // Use matches or overlaps with an Src + // Find the actual Src in the copy instruction + MachineInstr *Copy = findCopyDefViaUnit(*RUI, TRI); + MCRegister Src = Copy->getOperand(1).getReg().asMCReg(); + MCRegister Def = Copy->getOperand(0).getReg().asMCReg(); + if (Src != Use) { + // Case (1) + IsOverlappingUse = true; + RegsToInvalidate.insert(Src); + RegsToInvalidate.insert(Def); + } else { + CandidateSrc = Src; + CandidateDef = Def; + break; + } + } else { + // Case (2) + RegsToInvalidate.insert( + CopyInfoI->second.MI->getOperand(0).getReg().asMCReg()); + RegsToInvalidate.insert( + CopyInfoI->second.MI->getOperand(1).getReg().asMCReg()); + } + } + + // Can't have matching and overlapping Srcs at the same time. + assert(!CandidateSrc || !IsOverlappingUse); + + if (CandidateSrc) { + // Implies: !IsOverlappingUse. + if (!MI->isDebugValue() && + (hasImplicitOverlap(*MI, MOUse, TRI) || !MOUse.isRenamable() || + MOUse.isTied() || + !isForwardableRegClassCopy(CandidateSrc, *MI, OpIdx, TII, TRI))) { + // Case (3) + RegsToInvalidate.insert(CandidateSrc.asMCReg()); + RegsToInvalidate.insert(CandidateDef.asMCReg()); + continue; + } + + // Add to Uses for future rewrite + auto I = Uses.insert({Use, {MI}}); + if (!I.second) { + I.first->second.push_back(MI); + } + } + } + + for (MCRegister InvalidReg : RegsToInvalidate) { + for (MCRegUnitIterator RUI(InvalidReg, &TRI); RUI.isValid(); ++RUI) + Copies.erase(*RUI); + Uses.erase(InvalidReg); + } } MachineInstr *findAvailCopy(MachineInstr &DestCopy, MCRegister Reg, @@ -245,6 +428,7 @@ void clear() { Copies.clear(); + Uses.clear(); } }; @@ -281,12 +465,9 @@ bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def); void forwardUses(MachineInstr &MI); void propagateDefs(MachineInstr &MI); - bool isForwardableRegClassCopy(const MachineInstr &Copy, - const MachineInstr &UseI, unsigned UseIdx); bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx); - bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use); bool hasOverlappingMultipleDef(const MachineInstr &MI, const MachineOperand &MODef, Register Def); @@ -396,70 +577,6 @@ return false; } -/// Decide whether we should forward the source of \param Copy to its use in -/// \param UseI based on the physical register class constraints of the opcode -/// and avoiding introducing more cross-class COPYs. -bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy, - const MachineInstr &UseI, - unsigned UseIdx) { - - Register CopySrcReg = Copy.getOperand(1).getReg(); - - // If the new register meets the opcode register constraints, then allow - // forwarding. - if (const TargetRegisterClass *URC = - UseI.getRegClassConstraint(UseIdx, TII, TRI)) - return URC->contains(CopySrcReg); - - if (!UseI.isCopy()) - return false; - - /// COPYs don't have register class constraints, so if the user instruction - /// is a COPY, we just try to avoid introducing additional cross-class - /// COPYs. 
For example: - /// - /// RegClassA = COPY RegClassB // Copy parameter - /// ... - /// RegClassB = COPY RegClassA // UseI parameter - /// - /// which after forwarding becomes - /// - /// RegClassA = COPY RegClassB - /// ... - /// RegClassB = COPY RegClassB - /// - /// so we have reduced the number of cross-class COPYs and potentially - /// introduced a nop COPY that can be removed. - const TargetRegisterClass *UseDstRC = - TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg()); - - const TargetRegisterClass *SuperRC = UseDstRC; - for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses(); - SuperRC; SuperRC = *SuperRCI++) - if (SuperRC->contains(CopySrcReg)) - return true; - - return false; -} - -/// Check that \p MI does not have implicit uses that overlap with it's \p Use -/// operand (the register being replaced), since these can sometimes be -/// implicitly tied to other operands. For example, on AMDGPU: -/// -/// V_MOVRELS_B32_e32 %VGPR2, %M0, %EXEC, %VGPR2_VGPR3_VGPR4_VGPR5 -/// -/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no -/// way of knowing we need to update the latter when updating the former. -bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI, - const MachineOperand &Use) { - for (const MachineOperand &MIUse : MI.uses()) - if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() && - MIUse.isUse() && TRI->regsOverlap(Use.getReg(), MIUse.getReg())) - return true; - - return false; -} - /// For an MI that has multiple definitions, check whether \p MI has /// a definition that overlaps with another of its definitions. /// For example, on ARM: umull r9, r9, lr, r0 @@ -526,10 +643,10 @@ if (MRI->isReserved(CopySrcReg) && !MRI->isConstantPhysReg(CopySrcReg)) continue; - if (!isForwardableRegClassCopy(*Copy, MI, OpIdx)) + if (!isForwardableRegClassCopy(CopySrcReg, MI, OpIdx, *TII, *TRI)) continue; - if (hasImplicitOverlap(MI, MOUse)) + if (hasImplicitOverlap(MI, MOUse, *TRI)) continue; // Check that the instruction is not a copy that partially overwrites the @@ -571,7 +688,7 @@ LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName() << "\n"); - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) { + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr *MI = &*I; ++I; @@ -788,21 +905,21 @@ if (!MODef.isRenamable()) continue; - MachineInstr *Copy = + auto Copy = Tracker.findAvailBackwardCopy(MI, MODef.getReg().asMCReg(), *TRI); - if (!Copy) + if (!Copy.first) continue; - Register Def = Copy->getOperand(0).getReg(); - Register Src = Copy->getOperand(1).getReg(); + Register Def = Copy.first->getOperand(0).getReg(); + Register Src = Copy.first->getOperand(1).getReg(); if (MODef.getReg() != Src) continue; - if (!isBackwardPropagatableRegClassCopy(*Copy, MI, OpIdx)) + if (!isBackwardPropagatableRegClassCopy(*Copy.first, MI, OpIdx)) continue; - if (hasImplicitOverlap(MI, MODef)) + if (hasImplicitOverlap(MI, MODef, *TRI)) continue; if (hasOverlappingMultipleDef(MI, MODef, Def)) @@ -810,13 +927,40 @@ LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) << "\n with " << printReg(Def, TRI) << "\n in " - << MI << " from " << *Copy); + << MI << " from " << *Copy.first); + bool IsRenamable = Copy.first->getOperand(0).isRenamable(); MODef.setReg(Def); - MODef.setIsRenamable(Copy->getOperand(0).isRenamable()); + MODef.setIsRenamable(IsRenamable); LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); - 
MaybeDeadCopies.insert(Copy); + + // Update uses of the original Def. + // We don't need to perform checks here, Uses only contains rewrittable + // uses. + if (Copy.second) { + for (MachineInstr *User : *Copy.second) { + LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(Src, TRI) + << "\n with " << printReg(Def, TRI) + << "\n in " << *User << " from " + << *Copy.first); + if (User->isDebugValue()) { + MRI->updateDbgUsersToReg(Src, Def, User); + } else { + for (unsigned UseIdx = 0, UseEnd = User->getNumOperands(); + UseIdx != UseEnd; ++UseIdx) { + MachineOperand &MOUse = User->getOperand(UseIdx); + if (!MOUse.isReg() || MOUse.getReg() != Src) + continue; + MOUse.setReg(Def); + MOUse.setIsRenamable(IsRenamable); + } + } + LLVM_DEBUG(dbgs() << "MCP: After replacement: " << *User << "\n"); + } + } + + MaybeDeadCopies.insert(Copy.first); Changed = true; ++NumCopyBackwardPropagated; } @@ -843,7 +987,11 @@ // Unlike forward cp, we don't invoke propagateDefs here, // just let forward cp do COPY-to-COPY propagation. if (isBackwardPropagatableCopy(*MI, *MRI)) { + // Remove copies related to Src. Src can only possibly appear + // in copy candidates as define, because otherwise the current copy + // won't kill Src. Tracker.invalidateRegister(Src, *TRI); + // Remove copies related to Def. Tracker.invalidateRegister(Def, *TRI); Tracker.trackCopy(MI, *TRI); continue; @@ -860,6 +1008,9 @@ } propagateDefs(*MI); + + // Track uses after propagation, because we need the correct Def register. + Tracker.trackUse(MI, *TRI, *TII); for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -867,11 +1018,14 @@ if (!MO.getReg()) continue; - if (MO.isDef()) - Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI); - - if (MO.readsReg()) + if (MO.isDef()) { + // Even if we did apply propagation, the relevant copy instruction + // would still get invalidated here. The original instruction would + // trigger invalidation because its Def matches the Src of the relevant + // copy, the updated instruction would still do because its Def now + // matches the Def of the relevant copy. 
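// For illustration, the debug-use path handled above in concrete MIR; the
// registers mirror the copyprop_dbg_value test added to machine-copy-prop.mir
// further down in this patch:
//
//    renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg
//    DBG_VALUE $rbp, $noreg
//    renamable $rbx = COPY killed renamable $rbp
//
// is rewritten to
//
//    renamable $rbx = LEA64r $rax, 1, $noreg, 8, $noreg
//    DBG_VALUE $rbx, $noreg
//
// because the DBG_VALUE user is retargeted via updateDbgUsersToReg rather
// than being treated as a use that blocks the propagation.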
Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI); + } } } diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -7,12 +7,11 @@ ; ARMV6: @ %bb.0: @ %start ; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV6-NEXT: sub sp, sp, #28 -; ARMV6-NEXT: ldr r7, [sp, #72] +; ARMV6-NEXT: ldr lr, [sp, #72] ; ARMV6-NEXT: mov r6, r0 ; ARMV6-NEXT: str r0, [sp, #8] @ 4-byte Spill ; ARMV6-NEXT: ldr r4, [sp, #84] -; ARMV6-NEXT: umull r1, r0, r2, r7 -; ARMV6-NEXT: mov lr, r7 +; ARMV6-NEXT: umull r1, r0, r2, lr ; ARMV6-NEXT: umull r5, r10, r4, r2 ; ARMV6-NEXT: str r1, [r6] ; ARMV6-NEXT: ldr r6, [sp, #80] @@ -163,39 +162,38 @@ ; ARMV7-NEXT: ldr r0, [sp, #92] ; ARMV7-NEXT: cmp r3, #0 ; ARMV7-NEXT: movwne r3, #1 -; ARMV7-NEXT: ldr r2, [sp, #76] +; ARMV7-NEXT: ldr r2, [sp, #72] ; ARMV7-NEXT: cmp r0, #0 ; ARMV7-NEXT: movwne r0, #1 ; ARMV7-NEXT: cmp r1, #0 +; ARMV7-NEXT: and r0, r0, r3 +; ARMV7-NEXT: ldr r3, [sp, #76] ; ARMV7-NEXT: movwne r1, #1 ; ARMV7-NEXT: cmp r12, #0 -; ARMV7-NEXT: and r0, r0, r3 ; ARMV7-NEXT: movwne r12, #1 ; ARMV7-NEXT: cmp r5, #0 ; ARMV7-NEXT: orr r0, r0, r1 ; ARMV7-NEXT: movwne r5, #1 -; ARMV7-NEXT: cmp r2, #0 -; ARMV7-NEXT: mov r1, r2 -; ARMV7-NEXT: mov r3, r2 +; ARMV7-NEXT: cmp r3, #0 +; ARMV7-NEXT: mov r1, r3 ; ARMV7-NEXT: movwne r1, #1 ; ARMV7-NEXT: cmp r4, #0 -; ARMV7-NEXT: ldr r2, [sp, #72] ; ARMV7-NEXT: movwne r4, #1 ; ARMV7-NEXT: cmp lr, #0 -; ARMV7-NEXT: and r1, r1, r5 ; ARMV7-NEXT: movwne lr, #1 ; ARMV7-NEXT: orrs r2, r2, r3 ; ARMV7-NEXT: ldr r3, [sp, #88] ; ARMV7-NEXT: movwne r2, #1 -; ARMV7-NEXT: orr r1, r1, r4 +; ARMV7-NEXT: and r1, r1, r5 ; ARMV7-NEXT: orr r0, r0, r12 ; ARMV7-NEXT: orrs r3, r3, r6 -; ARMV7-NEXT: orr r1, r1, lr +; ARMV7-NEXT: orr r1, r1, r4 ; ARMV7-NEXT: movwne r3, #1 ; ARMV7-NEXT: adds r7, r9, r7 ; ARMV7-NEXT: str r7, [r8, #8] -; ARMV7-NEXT: and r2, r2, r3 +; ARMV7-NEXT: orr r1, r1, lr ; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; ARMV7-NEXT: and r2, r2, r3 ; ARMV7-NEXT: orr r0, r0, r11 ; ARMV7-NEXT: adcs r7, r10, r7 ; ARMV7-NEXT: str r7, [r8, #12] diff --git a/llvm/test/CodeGen/Mips/llvm-ir/mul.ll b/llvm/test/CodeGen/Mips/llvm-ir/mul.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/mul.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/mul.ll @@ -249,7 +249,7 @@ ; 64R6: daddu $2, $[[T1]], $[[T0]] ; 64R6-DAG: dmul $3, $5, $7 - ; MM32: lw $25, %call16(__multi3)($16) + ; MM32: lw $25, %call16(__multi3)($gp) %r = mul i128 %a, %b ret i128 %r diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll @@ -302,9 +302,8 @@ ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__divdi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__divdi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -319,9 +318,8 @@ ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__divdi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__divdi3)($gp) ; MMR6-NEXT: jalr $25 ; 
MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -420,65 +418,59 @@ ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__divti3)($16) +; MMR3-NEXT: lw $25, %call16(__divti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: sdiv_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__divti3)($16) +; MMR6-NEXT: lw $25, %call16(__divti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = sdiv i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll @@ -254,9 +254,8 @@ ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, 
%call16(__moddi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__moddi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -271,9 +270,8 @@ ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__moddi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__moddi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -372,65 +370,59 @@ ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__modti3)($16) +; MMR3-NEXT: lw $25, %call16(__modti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: srem_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__modti3)($16) +; MMR6-NEXT: lw $25, %call16(__modti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw 
$17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = srem i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll @@ -254,9 +254,8 @@ ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__udivdi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__udivdi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -271,9 +270,8 @@ ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__udivdi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__udivdi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -372,65 +370,59 @@ ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__udivti3)($16) +; MMR3-NEXT: lw $25, %call16(__udivti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: udiv_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, 
$sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__udivti3)($16) +; MMR6-NEXT: lw $25, %call16(__udivti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = udiv i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll --- a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll @@ -334,9 +334,8 @@ ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__umoddi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__umoddi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -351,9 +350,8 @@ ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__umoddi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__umoddi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -452,65 +450,59 @@ ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__umodti3)($16) +; MMR3-NEXT: lw $25, %call16(__umodti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: urem_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; 
MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__umodti3)($16) +; MMR6-NEXT: lw $25, %call16(__umodti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = urem i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/mcount.ll b/llvm/test/CodeGen/Mips/mcount.ll --- a/llvm/test/CodeGen/Mips/mcount.ll +++ b/llvm/test/CodeGen/Mips/mcount.ll @@ -104,9 +104,8 @@ ; MIPS32-MM-PIC-NEXT: .cfi_def_cfa_offset 24 ; MIPS32-MM-PIC-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-MM-PIC-NEXT: .cfi_offset 31, -4 -; MIPS32-MM-PIC-NEXT: addu $2, $2, $25 -; MIPS32-MM-PIC-NEXT: lw $25, %call16(_mcount)($2) -; MIPS32-MM-PIC-NEXT: move $gp, $2 +; MIPS32-MM-PIC-NEXT: addu $gp, $2, $25 +; MIPS32-MM-PIC-NEXT: lw $25, %call16(_mcount)($gp) ; MIPS32-MM-PIC-NEXT: move $1, $ra ; MIPS32-MM-PIC-NEXT: .reloc ($tmp0), R_MICROMIPS_JALR, _mcount ; MIPS32-MM-PIC-NEXT: $tmp0: diff --git a/llvm/test/CodeGen/Mips/micromips-gp-rc.ll b/llvm/test/CodeGen/Mips/micromips-gp-rc.ll --- a/llvm/test/CodeGen/Mips/micromips-gp-rc.ll +++ b/llvm/test/CodeGen/Mips/micromips-gp-rc.ll @@ -14,5 +14,5 @@ ; Function Attrs: noreturn declare void @exit(i32 signext) -; CHECK: move $gp, ${{[0-9]+}} +; CHECK: addu $gp, ${{[0-9]+}}, ${{[0-9]+}} diff --git a/llvm/test/CodeGen/Mips/tailcall/tailcall.ll b/llvm/test/CodeGen/Mips/tailcall/tailcall.ll --- a/llvm/test/CodeGen/Mips/tailcall/tailcall.ll +++ b/llvm/test/CodeGen/Mips/tailcall/tailcall.ll @@ -44,7 +44,7 @@ entry: ; ALL-LABEL: caller1: ; PIC32: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{(s16)?}} $25 ; PIC32R6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc @@ -177,7 +177,7 @@ ; ALL-LABEL: caller8_1: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{(s16)?}} $25 ; STATIC32: jal ; STATIC32MMR6: balc ; PIC64: jalr $25 @@ -295,7 +295,7 @@ ; ALL-LABEL: caller13: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{(s16)?}} $25 ; STATIC32: jal ; STATIC32MMR6: balc ; STATIC64: jal diff --git a/llvm/test/CodeGen/Mips/tls.ll b/llvm/test/CodeGen/Mips/tls.ll --- a/llvm/test/CodeGen/Mips/tls.ll +++ b/llvm/test/CodeGen/Mips/tls.ll @@ -31,7 +31,6 @@ ; MM-DAG: addu $[[R0:[a-z0-9]+]], $2, $25 ; MM-DAG: addiu $4, $[[R0]], %tlsgd(t1) ; MM-DAG: lw $25, %call16(__tls_get_addr)($[[R0]]) -; MM-DAG: move $gp, $2 ; MM-DAG: jalr $25 ; MM-DAG: lw16 $2, 0($2) } diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv32.ll @@ -426,12 +426,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, mu ; CHECK-NEXT: vle1.v v25, (a0) -; CHECK-NEXT: vmand.mm v25, v0, v25 +; CHECK-NEXT: vmand.mm v2, v0, v25 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v0, v25, a0 -; CHECK-NEXT: vmv1r.v v2, v25 +; CHECK-NEXT: vslidedown.vx v0, v2, a0 ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu ; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp-rv64.ll @@ -426,12 +426,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, mu ; CHECK-NEXT: vle1.v v25, (a0) -; CHECK-NEXT: vmand.mm v25, v0, v25 +; CHECK-NEXT: vmand.mm v2, v0, v25 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; CHECK-NEXT: vslidedown.vx v0, v25, a0 -; CHECK-NEXT: vmv1r.v v2, v25 +; CHECK-NEXT: vslidedown.vx v0, v2, a0 ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu ; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1026,8 +1026,8 @@ ; X64-NEXT: movq %rbx, %r10 ; X64-NEXT: andq %r13, %r10 ; X64-NEXT: shlq $4, %r10 -; X64-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 -; X64-NEXT: andq %rax, %rbx +; X64-NEXT: movabsq $-1085102592571150096, %r15 # imm = 0xF0F0F0F0F0F0F0F0 +; X64-NEXT: andq %r15, %rbx ; X64-NEXT: shrq $4, %rbx ; X64-NEXT: orq %r10, %rbx ; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 @@ -1048,7 +1048,7 @@ ; X64-NEXT: movq %rbp, %rdi ; X64-NEXT: andq %r13, %rdi ; X64-NEXT: shlq $4, %rdi -; X64-NEXT: andq %rax, %rbp +; X64-NEXT: andq %r15, %rbp ; X64-NEXT: shrq $4, %rbp ; X64-NEXT: orq %rdi, %rbp ; X64-NEXT: movq %rbp, %rdi @@ -1069,8 +1069,7 @@ ; X64-NEXT: movq %rbp, %r10 ; X64-NEXT: andq %r13, %r10 ; X64-NEXT: shlq $4, %r10 -; X64-NEXT: andq %rax, %rbp -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: andq %r15, %rbp ; X64-NEXT: shrq $4, %rbp ; X64-NEXT: orq %r10, %rbp ; X64-NEXT: movq %rbp, %r10 diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1298,13 +1298,12 @@ ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1381,13 +1380,12 @@ ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/machine-copy-prop.mir b/llvm/test/CodeGen/X86/machine-copy-prop.mir --- a/llvm/test/CodeGen/X86/machine-copy-prop.mir +++ b/llvm/test/CodeGen/X86/machine-copy-prop.mir @@ -5,15 +5,20 @@ define void @copyprop_remove_kill0() { ret void } define void @copyprop_remove_kill1() { ret void } define void @copyprop_remove_kill2() { ret void } + define void @copyprop_dbg_value() { ret void } define void @copyprop0() { ret void } define void @copyprop1() { ret void } define void @copyprop2() { ret void } + define void @copyprop3() { ret void } define void @nocopyprop0() { ret void } define void @nocopyprop1() { ret void } define void @nocopyprop2() { ret void } define void @nocopyprop3() { ret void } define void @nocopyprop4() { ret void } define void @nocopyprop5() { ret void } + define void @nocopyprop6() { ret void } + define void @nocopyprop7() { ret void } + define void @nocopyprop8() { ret void } ... --- # The second copy is redundant and will be removed, check that we also remove @@ -67,6 +72,20 @@ NOOP implicit $rax, implicit $rdi ... --- +# DBG_VALUE shouldn't stop backward copy propagation +# CHECK-LABEL: name: copyprop_dbg_value +# CHECK: renamable $rbx = LEA64r $rax, 1, $noreg, 8, $noreg +# CHECK-NEXT: DBG_VALUE $rbx, $noreg +# CHECK-NEXT: DBG_VALUE $noreg, $noreg +name: copyprop_dbg_value +body: | + bb.0: + renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg + DBG_VALUE $rbp, $noreg + renamable $rbx = COPY killed renamable $rbp + DBG_VALUE $noreg, $noreg +... +--- # The second copy is redundant; the call preserves the source and dest register. # CHECK-LABEL: name: copyprop0 # CHECK: bb.0: @@ -117,6 +136,19 @@ NOOP implicit $rax, implicit $rdi ... --- +# CHECK-LABEL: name: copyprop3 +# CHECK: renamable $rbx = LEA64r $rax, 1, $noreg, 8, $noreg +# CHECK-NEXT: renamable $rax = COPY renamable $rbx +# CHECK-NEXT: NOOP implicit $rax +name: copyprop3 +body: | + bb.0: + renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg + renamable $rax = COPY renamable $rbp + renamable $rbx = COPY killed renamable $rbp + NOOP implicit $rax +... +--- # The second copy is not redundant if the source register ($rax) is clobbered # even if the dest ($rbp) is not. # CHECK-LABEL: name: nocopyprop0 @@ -213,3 +245,48 @@ $rip = COPY $rax $rip = COPY $rax ... +--- +# Use of a sub-register should stop backward propagation +# CHECK-LABEL: name: nocopyprop6 +# CHECK: renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg +# CHECK-NEXT: renamable $rax = COPY renamable $ebp +# CHECK-NEXT: renamable $rbx = COPY killed renamable $rbp +# CHECK-NEXT: NOOP implicit $rax, implicit $rbx +name: nocopyprop6 +body: | + bb.0: + renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg + renamable $rax = COPY renamable $ebp + renamable $rbx = COPY killed renamable $rbp + NOOP implicit $rax, implicit $rbx +... +--- +# Non-renamable use of a register should stop backward propagation +# CHECK-LABEL: name: nocopyprop7 +# CHECK: renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg +# CHECK-NEXT: renamable $rax = COPY $rbp +# CHECK-NEXT: renamable $rbx = COPY killed renamable $rbp +# CHECK-NEXT: NOOP implicit $rax, implicit $rbx +name: nocopyprop7 +body: | + bb.0: + renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg + renamable $rax = COPY $rbp + renamable $rbx = COPY killed renamable $rbp + NOOP implicit $rax, implicit $rbx +... 
+--- +# Read of the Def register should stop backward propagation +# CHECK-LABEL: name: nocopyprop8 +# CHECK: renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg +# CHECK-NEXT: renamable $rax = COPY $rbx +# CHECK-NEXT: renamable $rbx = COPY killed renamable $rbp +# CHECK-NEXT: NOOP implicit $rax, implicit $rbx +name: nocopyprop8 +body: | + bb.0: + renamable $rbp = LEA64r $rax, 1, $noreg, 8, $noreg + renamable $rax = COPY $rbx + renamable $rbx = COPY killed renamable $rbp + NOOP implicit $rax, implicit $rbx +... diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -4305,18 +4305,17 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 124(%edi), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 124(%ebx), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl 120(%edi), %esi -; X32-NEXT: movl %edi, %ebx +; X32-NEXT: movl 120(%ebx), %esi ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 112(%edi), %edi +; X32-NEXT: movl 112(%ebx), %edi ; X32-NEXT: movl 116(%ebx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -4885,17 +4884,16 @@ ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r12, %rsi ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq 24(%rdi), %rdi +; X64-NEXT: movq 24(%rdi), %r12 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: adcq %rbx, %rbp ; X64-NEXT: setb %r9b ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rbp, %rbx @@ -5042,8 +5040,8 @@ ; X64-NEXT: addq %r12, %rbx ; X64-NEXT: adcq %r11, %r8 ; X64-NEXT: setb %r14b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %r10, (%rsp) # 8-byte Spill ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx @@ -5055,8 +5053,7 @@ ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx @@ -5086,25 +5083,22 @@ ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 32(%rsi), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 32(%r13), %r11 ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r9, %rbx ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; 
X64-NEXT: movq %rdi, %r11 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 40(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 40(%r13), %r8 ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rbp, %r9 @@ -5239,29 +5233,27 @@ ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rbx, %r12 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %bl ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rcx, %r13 @@ -5382,26 +5374,23 @@ ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 64(%rsi), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 64(%r13), %r8 ; X64-NEXT: movq (%rsp), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 72(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 72(%r13), %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rbp, %r14 @@ -5483,8 +5472,8 @@ ; X64-NEXT: addq %r15, %rsi ; X64-NEXT: adcq %r10, %rbx ; X64-NEXT: setb %r9b -; X64-NEXT: movq (%rsp), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq (%rsp), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 @@ -5495,8 +5484,7 @@ ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, %r15 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax @@ -5567,12 +5555,11 @@ ; X64-NEXT: adcq %rax, %r13 ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq %r8, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; 
X64-NEXT: movq 120(%rdx), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq 120(%rdi), %rcx
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq 112(%rdx), %rsi
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq 112(%rdi), %rsi
 ; X64-NEXT: movq %rax, %rbx
 ; X64-NEXT: mulq %rsi
 ; X64-NEXT: movq %rax, %r10
@@ -5764,19 +5751,18 @@
 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT: adcq $0, %rdx
 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq 96(%rbp), %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq 96(%rdi), %rsi
 ; X64-NEXT: imulq %rsi, %r9
 ; X64-NEXT: movq %rsi, %rax
 ; X64-NEXT: mulq %rbx
 ; X64-NEXT: movq %rax, %r14
 ; X64-NEXT: addq %r9, %rdx
-; X64-NEXT: movq 104(%rbp), %r15
+; X64-NEXT: movq 104(%rdi), %r15
 ; X64-NEXT: imulq %r15, %rbx
 ; X64-NEXT: addq %rdx, %rbx
 ; X64-NEXT: movq %rbx, %r9
-; X64-NEXT: movq 112(%rbp), %rax
-; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq 112(%rdi), %rax
 ; X64-NEXT: movq %rax, %rcx
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT: imulq %rbx, %rcx
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -796,18 +796,17 @@
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 60(%edi), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl 60(%ebx), %ecx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: movl 56(%edi), %esi
-; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: movl 56(%ebx), %esi
 ; X32-NEXT: mull %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: addl %ecx, %edx
 ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl 48(%edi), %edi
+; X32-NEXT: movl 48(%ebx), %edi
 ; X32-NEXT: movl 52(%ebx), %ebp
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %ebx
@@ -1375,12 +1374,11 @@
 ; X64-NEXT: adcq %rax, %r15
 ; X64-NEXT: addq %r12, %r13
 ; X64-NEXT: adcq %rbp, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq 56(%rdx), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq 56(%rsi), %rcx
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq 48(%rdx), %rbp
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq 48(%rsi), %rbp
 ; X64-NEXT: movq %rax, %r11
 ; X64-NEXT: mulq %rbp
 ; X64-NEXT: movq %rax, %r12
diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll
--- a/llvm/test/CodeGen/X86/pr43820.ll
+++ b/llvm/test/CodeGen/X86/pr43820.ll
@@ -18,17 +18,15 @@
 ; CHECK-NEXT: movq %rbx, %rbp
 ; CHECK-NEXT: andq %rdi, %rbp
 ; CHECK-NEXT: shlq $4, %rbp
-; CHECK-NEXT: movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0
-; CHECK-NEXT: andq %r11, %rbx
-; CHECK-NEXT: movq %r11, %rax
+; CHECK-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0
+; CHECK-NEXT: andq %rax, %rbx
 ; CHECK-NEXT: shrq $4, %rbx
 ; CHECK-NEXT: orq %rbp, %rbx
 ; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
 ; CHECK-NEXT: movq %rbx, %r14
 ; CHECK-NEXT: andq %r11, %r14
-; CHECK-NEXT: movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC
-; CHECK-NEXT: andq %rbp, %rbx
-; CHECK-NEXT: movq %rbp, %r15
+; CHECK-NEXT: movabsq $-3689348814741910324, %r15 # imm = 0xCCCCCCCCCCCCCCCC
+; CHECK-NEXT: andq %r15, %rbx
 ; CHECK-NEXT: shrq $2, %rbx
 ; CHECK-NEXT: leaq (%rbx,%r14,4), %r14
 ; CHECK-NEXT: movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -337,11 +337,10 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __divti3
 ; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: subl $1, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: sbbl $0, %ebx
@@ -586,8 +585,8 @@
 ; X86-NEXT: calll __divdi3
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %ecx
 ; X86-NEXT: sarl $31, %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %edx, %ebx
@@ -598,8 +597,7 @@
 ; X86-NEXT: pushl %ecx
 ; X86-NEXT: movl %ecx, %ebp
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: pushl %esi
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: calll __moddi3
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -985,10 +985,9 @@
 ; X86-NEXT: pushl %ecx
 ; X86-NEXT: pushl %ecx
 ; X86-NEXT: pushl 36(%ebp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT: pushl %eax
diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll
--- a/llvm/test/CodeGen/X86/shift-and.ll
+++ b/llvm/test/CodeGen/X86/shift-and.ll
@@ -172,9 +172,8 @@
 ; X32-NEXT: pushl %esi
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: shrdl $3, %eax, %esi
-; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: shrdl $3, %edi, %esi
 ; X32-NEXT: shrl $3, %edi
 ; X32-NEXT: movl (%ecx), %eax
 ; X32-NEXT: movl 4(%ecx), %edx
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -2074,17 +2074,16 @@
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT: movzwl 16(%eax), %edx
 ; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SSE-NEXT: movdqa (%eax), %xmm3
+; X86-SSE-NEXT: movdqa (%eax), %xmm5
 ; X86-SSE-NEXT: movdqa (%ecx), %xmm0
 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm1
 ; X86-SSE-NEXT: pxor %xmm4, %xmm4
-; X86-SSE-NEXT: movdqa %xmm3, %xmm2
-; X86-SSE-NEXT: pextrw $7, %xmm3, %eax
-; X86-SSE-NEXT: pextrw $4, %xmm3, %esi
-; X86-SSE-NEXT: pextrw $0, %xmm3, %edi
-; X86-SSE-NEXT: pextrw $1, %xmm3, %ebx
-; X86-SSE-NEXT: pextrw $3, %xmm3, %ebp
-; X86-SSE-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE-NEXT: movdqa %xmm5, %xmm2
+; X86-SSE-NEXT: pextrw $7, %xmm5, %eax
+; X86-SSE-NEXT: pextrw $4, %xmm5, %esi
+; X86-SSE-NEXT: pextrw $0, %xmm5, %edi
+; X86-SSE-NEXT: pextrw $1, %xmm5, %ebx
+; X86-SSE-NEXT: pextrw $3, %xmm5, %ebp
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
@@ -2319,17 +2318,16 @@
 ; X64-SSE-LABEL: PR34947:
 ; X64-SSE: # %bb.0:
 ; X64-SSE-NEXT: movzwl 16(%rdi), %r8d
-; X64-SSE-NEXT: movdqa (%rdi), %xmm3
+; X64-SSE-NEXT: movdqa (%rdi), %xmm5
 ; X64-SSE-NEXT: movdqa (%rsi), %xmm0
 ; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1
 ; X64-SSE-NEXT: pxor %xmm4, %xmm4
-; X64-SSE-NEXT: movdqa %xmm3, %xmm2
-; X64-SSE-NEXT: pextrw $7, %xmm3, %eax
-; X64-SSE-NEXT: pextrw $4, %xmm3, %r9d
-; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d
-; X64-SSE-NEXT: pextrw $1, %xmm3, %r11d
-; X64-SSE-NEXT: pextrw $3, %xmm3, %ecx
-; X64-SSE-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE-NEXT: movdqa %xmm5, %xmm2
+; X64-SSE-NEXT: pextrw $7, %xmm5, %eax
+; X64-SSE-NEXT: pextrw $4, %xmm5, %r9d
+; X64-SSE-NEXT: pextrw $0, %xmm5, %r10d
+; X64-SSE-NEXT: pextrw $1, %xmm5, %r11d
+; X64-SSE-NEXT: pextrw $3, %xmm5, %ecx
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -559,11 +559,10 @@
 define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
 ; SSE-LABEL: v16f32_estimate:
 ; SSE: # %bb.0:
-; SSE-NEXT: rsqrtps %xmm0, %xmm5
+; SSE-NEXT: rsqrtps %xmm0, %xmm6
 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; SSE-NEXT: mulps %xmm5, %xmm0
-; SSE-NEXT: mulps %xmm5, %xmm0
-; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm6, %xmm0
+; SSE-NEXT: mulps %xmm6, %xmm0
 ; SSE-NEXT: mulps %xmm4, %xmm6
 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; SSE-NEXT: addps %xmm5, %xmm0
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -64,10 +64,9 @@
 ; X86-NEXT: movl %edx, %ecx
 ; X86-NEXT: addl %esi, %ecx
 ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: mull %ebx
 ; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -521,11 +521,10 @@
 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14
 ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14
-; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm0[1,2,3],xmm14[4],xmm0[5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7]
 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm0[1,2,3],xmm9[4],xmm0[5,6,7]
@@ -557,10 +556,9 @@
 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm15
-; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm15
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm0[1,2,3],xmm15[4],xmm0[5,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3
 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
@@ -781,13 +779,11 @@
 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = [0,2,2,3,0,2,4,6]
 ; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm15, %ymm2
 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 =
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm4
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm3
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm2, %ymm4
 ; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm15, %ymm2
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm5
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm14
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm2, %ymm5
 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -827,10 +823,9 @@
 ; AVX2-FAST-ALL-NEXT: vpshufb %xmm3, %xmm8, %xmm12
 ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 =
 ; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm12, %ymm12
-; AVX2-FAST-ALL-NEXT: vmovdqa %ymm11, %ymm15
+; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm12, %ymm12
 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm2, %ymm2
 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
@@ -937,27 +932,24 @@
 ; AVX2-FAST-PERLANE-NEXT: subq $248, %rsp
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm9
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm8
 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm0[1,2,3],xmm8[4],xmm0[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm0[1,2,3],xmm9[4],xmm0[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm15
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm0[1,2,3],xmm15[4],xmm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3