Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -459,6 +459,13 @@ unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const; + /// Returns true if the target has a preference for the operand order of + /// the given machine instruction; in that case \p Commute is set to whether + /// a commute is required to get the desired operand order. + virtual bool hasCommutePreference(MachineInstr &MI, bool &Commute) const { + return false; + } + /// A pair composed of a register and a sub-register index. /// Used to give some type checking when modeling Reg:SubReg. struct RegSubRegPair { Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -527,6 +527,11 @@ if (isRevCopyChain(RegB, RegA, MaxDataFlowEdge)) return false; + // Look for any other target-specific commute preference. + bool Commute; + if (TII->hasCommutePreference(*MI, Commute)) + return Commute; + // Since there are no intervening uses for both registers, then commute // if the def of RegC is closer. Its live interval is shorter. return LastDefB && LastDefC && LastDefC > LastDefB; Index: llvm/lib/Target/X86/X86FixupLEAs.cpp =================================================================== --- llvm/lib/Target/X86/X86FixupLEAs.cpp +++ llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -79,6 +79,30 @@ MachineBasicBlock &MBB, bool OptIncDec, bool UseLEAForSP) const; + /// Look for and transform the sequence + /// lea (reg1, reg2), reg3 + /// sub reg3, reg4 + /// to + /// sub reg1, reg4 + /// sub reg2, reg4 + /// It can also optimize the sequence lea/add similarly. 
+ bool optLEAALU(MachineBasicBlock::iterator &I, MachineBasicBlock &MBB) const; + + /// Step forwards in MBB, looking for an ADD/SUB instruction which uses + /// the dest register of LEA instruction I. + MachineBasicBlock::iterator searchALUInst(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB) const; + + /// Check instructions between LeaI and AluI (exclusive of both). + /// Set BaseIndexDef to true if base or index register from LeaI is defined. + /// Set AluDestRef to true if the dest register of AluI is used or defined. + /// *KilledBase is set to the operand that kills the base register, if any. + /// *KilledIndex is set to the operand that kills the index register, if any. + void checkRegUsage(MachineBasicBlock::iterator &LeaI, + MachineBasicBlock::iterator &AluI, bool &BaseIndexDef, + bool &AluDestRef, MachineOperand **KilledBase, + MachineOperand **KilledIndex) const; + /// Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); @@ -338,6 +362,18 @@ } } +static inline unsigned getSUBrrFromLEA(unsigned LEAOpcode) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA32r: + case X86::LEA64_32r: + return X86::SUB32rr; + case X86::LEA64r: + return X86::SUB64rr; + } +} + static inline unsigned getADDriFromLEA(unsigned LEAOpcode, const MachineOperand &Offset) { bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); @@ -364,6 +400,162 @@ } } +MachineBasicBlock::iterator +FixupLEAPass::searchALUInst(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB) const { + const int InstrDistanceThreshold = 5; + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst = std::next(I); + + unsigned LEAOpcode = I->getOpcode(); + unsigned AddOpcode = getADDrrFromLEA(LEAOpcode); + unsigned SubOpcode = getSUBrrFromLEA(LEAOpcode); + Register DestReg = I->getOperand(0).getReg(); + + while (CurInst != MBB.end()) { + if 
(CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > InstrDistanceThreshold) + break; + + // Check that the lea dest register is used only by an add/sub instruction. + for (unsigned I = 0, E = CurInst->getNumOperands(); I != E; ++I) { + MachineOperand &Opnd = CurInst->getOperand(I); + if (Opnd.isReg()) { + if (Opnd.getReg() == DestReg) { + if (Opnd.isDef() || !Opnd.isKill()) + return MachineBasicBlock::iterator(); + + unsigned AluOpcode = CurInst->getOpcode(); + if (AluOpcode != AddOpcode && AluOpcode != SubOpcode) + return MachineBasicBlock::iterator(); + + MachineOperand &Opnd2 = CurInst->getOperand(3 - I); + MachineOperand AluDest = CurInst->getOperand(0); + if (Opnd2.getReg() != AluDest.getReg()) + return MachineBasicBlock::iterator(); + + // X - (Y + Z) may generate different flags than (X - Y) - Z when + // there is overflow. So we can't change the alu instruction if the + // flags register is live. + if (!CurInst->registerDefIsDead(X86::EFLAGS, TRI)) + return MachineBasicBlock::iterator(); + + return CurInst; + } + if (TRI->regsOverlap(DestReg, Opnd.getReg())) + return MachineBasicBlock::iterator(); + } + } + + InstrDistance++; + ++CurInst; + } + return MachineBasicBlock::iterator(); +} + +void FixupLEAPass::checkRegUsage(MachineBasicBlock::iterator &LeaI, + MachineBasicBlock::iterator &AluI, + bool &BaseIndexDef, bool &AluDestRef, + MachineOperand **KilledBase, + MachineOperand **KilledIndex) const { + BaseIndexDef = AluDestRef = false; + *KilledBase = *KilledIndex = nullptr; + Register BaseReg = LeaI->getOperand(1 + X86::AddrBaseReg).getReg(); + Register IndexReg = LeaI->getOperand(1 + X86::AddrIndexReg).getReg(); + Register AluDestReg = AluI->getOperand(0).getReg(); + + MachineBasicBlock::iterator CurInst = std::next(LeaI); + while (CurInst != AluI) { + for (unsigned I = 0, E = CurInst->getNumOperands(); I != E; ++I) { + MachineOperand &Opnd = CurInst->getOperand(I); + if (!Opnd.isReg()) + continue; + Register Reg = Opnd.getReg(); + 
if (TRI->regsOverlap(Reg, AluDestReg)) + AluDestRef = true; + if (TRI->regsOverlap(Reg, BaseReg)) { + if (Opnd.isDef()) + BaseIndexDef = true; + else if (Opnd.isKill()) + *KilledBase = &Opnd; + } + if (TRI->regsOverlap(Reg, IndexReg)) { + if (Opnd.isDef()) + BaseIndexDef = true; + else if (Opnd.isKill()) + *KilledIndex = &Opnd; + } + } + ++CurInst; + } +} + +bool FixupLEAPass::optLEAALU(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB) const { + // Look for an add/sub instruction which uses the result of lea. + MachineBasicBlock::iterator AluI = searchALUInst(I, MBB); + if (AluI == MachineBasicBlock::iterator()) + return false; + + // Check for any related register usage between lea and alu. + bool BaseIndexDef, AluDestRef; + MachineOperand *KilledBase, *KilledIndex; + checkRegUsage(I, AluI, BaseIndexDef, AluDestRef, &KilledBase, &KilledIndex); + + MachineBasicBlock::iterator InsertPos = AluI; + if (BaseIndexDef) { + if (AluDestRef) + return false; + InsertPos = I; + KilledBase = KilledIndex = nullptr; + } + + // Check whether any of the registers are the same. + Register AluDestReg = AluI->getOperand(0).getReg(); + Register BaseReg = I->getOperand(1 + X86::AddrBaseReg).getReg(); + Register IndexReg = I->getOperand(1 + X86::AddrIndexReg).getReg(); + if (I->getOpcode() == X86::LEA64_32r) { + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); + } + if (AluDestReg == IndexReg) { + if (BaseReg == IndexReg) + return false; + std::swap(BaseReg, IndexReg); + std::swap(KilledBase, KilledIndex); + } + if (BaseReg == IndexReg) + KilledBase = nullptr; + + // Now it's safe to change instructions. + MachineInstr *NewMI1, *NewMI2; + unsigned NewOpcode = AluI->getOpcode(); + NewMI1 = BuildMI(MBB, InsertPos, AluI->getDebugLoc(), TII->get(NewOpcode), + AluDestReg) + .addReg(AluDestReg, RegState::Kill) + .addReg(BaseReg, KilledBase ? 
RegState::Kill : 0); + NewMI1->addRegisterDead(X86::EFLAGS, TRI); + NewMI2 = BuildMI(MBB, InsertPos, AluI->getDebugLoc(), TII->get(NewOpcode), + AluDestReg) + .addReg(AluDestReg, RegState::Kill) + .addReg(IndexReg, KilledIndex ? RegState::Kill : 0); + NewMI2->addRegisterDead(X86::EFLAGS, TRI); + + // Clear the old Kill flags. + if (KilledBase) + KilledBase->setIsKill(false); + if (KilledIndex) + KilledIndex->setIsKill(false); + + MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI1, 1); + MBB.getParent()->substituteDebugValuesForInst(*AluI, *NewMI2, 1); + MBB.erase(I); + MBB.erase(AluI); + I = NewMI1; + return true; +} + bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, MachineBasicBlock &MBB, bool OptIncDec, bool UseLEAForSP) const { @@ -398,6 +590,7 @@ MachineInstr *NewMI = nullptr; + // Case 1. // Look for lea(%reg1, %reg2), %reg1 or lea(%reg2, %reg1), %reg1 // which can be turned into add %reg2, %reg1 if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0 && @@ -417,6 +610,7 @@ .addReg(BaseReg).addReg(IndexReg); } } else if (DestReg == BaseReg && IndexReg == 0) { + // Case 2. // This is an LEA with only a base register and a displacement, // We can use ADDri or INC/DEC. @@ -447,6 +641,12 @@ .addReg(BaseReg).addImm(Disp.getImm()); } } + } else if (BaseReg != 0 && IndexReg != 0 && Disp.getImm() == 0) { + // Case 3. + // Look for and transform the sequence + // lea (reg1, reg2), reg3 + // sub reg3, reg4 + return optLEAALU(I, MBB); } else return false; Index: llvm/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.h +++ llvm/lib/Target/X86/X86InstrInfo.h @@ -284,6 +284,10 @@ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + /// Returns true if we have a preference on the operand order in MI; the + /// commute decision is returned in \p Commute. 
+ bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override; + /// Returns an adjusted FMA opcode that must be used in FMA instruction that /// performs the same computations as the given \p MI but which has the /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted. Index: llvm/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.cpp +++ llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2670,6 +2670,58 @@ return false; } +static bool isConvertibleLEA(MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + if (Opcode != X86::LEA32r && Opcode != X86::LEA64r && + Opcode != X86::LEA64_32r) + return false; + + const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt); + const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp); + const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg); + + if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 || + Scale.getImm() > 1) + return false; + + return true; +} + +bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const { + // Currently we're only interested in the following sequence. + // r3 = lea r1, r2 + // r5 = add r3, r4 + // Both r3 and r4 are killed in add; we hope the add instruction has the + // operand order + // r5 = add r4, r3 + // so that later in X86FixupLEAs the lea instruction can be rewritten as add. + unsigned Opcode = MI.getOpcode(); + if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr) + return false; + + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + Register Reg1 = MI.getOperand(1).getReg(); + Register Reg2 = MI.getOperand(2).getReg(); + + // Check if Reg1 comes from LEA in the same MBB. + if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) { + if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) { + Commute = true; + return true; + } + } + + // Check if Reg2 comes from LEA in the same MBB. 
+ if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) { + if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) { + Commute = false; + return true; + } + } + + return false; +} + X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; Index: llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll =================================================================== --- llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll +++ llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll @@ -29,9 +29,9 @@ ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: addq %rdx, %rbx ; CHECK-NEXT: addq %rsi, %rbx -; CHECK-NEXT: leaq (%r9,%r10), %rsi -; CHECK-NEXT: leaq (%rsi,%r8), %rdx -; CHECK-NEXT: addq %rsi, %rdx +; CHECK-NEXT: leaq (%r9,%r10), %rdx +; CHECK-NEXT: addq %rdx, %rdx +; CHECK-NEXT: addq %r8, %rdx ; CHECK-NEXT: movq X(%rip), %rdi ; CHECK-NEXT: addq %rbx, %r12 ; CHECK-NEXT: addq %r8, %rdx @@ -41,9 +41,9 @@ ; CHECK-NEXT: addq %r12, %rsi ; CHECK-NEXT: addq %r11, %rdi ; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: leaq (%r10,%r8), %rbx -; CHECK-NEXT: leaq (%rdx,%rbx), %rsi -; CHECK-NEXT: addq %rbx, %rsi +; CHECK-NEXT: leaq (%r10,%r8), %rsi +; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: addq %rdx, %rsi ; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %r12, %rdi ; CHECK-NEXT: addq %rdi, %r9 @@ -54,9 +54,9 @@ ; CHECK-NEXT: addq %r9, %rdi ; CHECK-NEXT: addq %r14, %rbx ; CHECK-NEXT: addq %rdi, %rbx -; CHECK-NEXT: leaq (%rdx,%r8), %rax -; CHECK-NEXT: leaq (%rsi,%rax), %rdi -; CHECK-NEXT: addq %rax, %rdi +; CHECK-NEXT: leaq (%rdx,%r8), %rdi +; CHECK-NEXT: addq %rdi, %rdi +; CHECK-NEXT: addq %rsi, %rdi ; CHECK-NEXT: movq X(%rip), %rcx ; CHECK-NEXT: addq %r9, %rbx ; CHECK-NEXT: addq %rbx, %r10 @@ -67,9 +67,9 @@ ; CHECK-NEXT: addq %r10, %rax ; CHECK-NEXT: addq %r15, %rcx ; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%rsi,%rdx), %rbx -; CHECK-NEXT: leaq (%rdi,%rbx), %r11 -; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: leaq 
(%rsi,%rdx), %r11 +; CHECK-NEXT: addq %r11, %r11 +; CHECK-NEXT: addq %rdi, %r11 ; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %r10, %rcx ; CHECK-NEXT: addq %rcx, %r8 @@ -80,9 +80,9 @@ ; CHECK-NEXT: addq %r8, %rcx ; CHECK-NEXT: addq %r12, %rbx ; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: leaq (%rdi,%rsi), %rax -; CHECK-NEXT: leaq (%r11,%rax), %r14 -; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: leaq (%rdi,%rsi), %r14 +; CHECK-NEXT: addq %r14, %r14 +; CHECK-NEXT: addq %r11, %r14 ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: addq %r8, %rbx ; CHECK-NEXT: addq %rbx, %rdx @@ -93,9 +93,9 @@ ; CHECK-NEXT: addq %rdx, %rbx ; CHECK-NEXT: addq %r9, %rax ; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: leaq (%r11,%rdi), %rbx -; CHECK-NEXT: leaq (%r14,%rbx), %r9 -; CHECK-NEXT: addq %rbx, %r9 +; CHECK-NEXT: leaq (%r11,%rdi), %r9 +; CHECK-NEXT: addq %r9, %r9 +; CHECK-NEXT: addq %r14, %r9 ; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: addq %rax, %rsi @@ -106,9 +106,9 @@ ; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: addq %r10, %rbx ; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: leaq (%r14,%r11), %rax -; CHECK-NEXT: leaq (%r9,%rax), %r10 -; CHECK-NEXT: addq %rax, %r10 +; CHECK-NEXT: leaq (%r14,%r11), %r10 +; CHECK-NEXT: addq %r10, %r10 +; CHECK-NEXT: addq %r9, %r10 ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: addq %rsi, %rbx ; CHECK-NEXT: addq %rbx, %rdi @@ -119,9 +119,9 @@ ; CHECK-NEXT: addq %rdi, %rbx ; CHECK-NEXT: addq %r8, %rax ; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: leaq (%r9,%r14), %rbx -; CHECK-NEXT: leaq (%r10,%rbx), %r8 -; CHECK-NEXT: addq %rbx, %r8 +; CHECK-NEXT: leaq (%r9,%r14), %r8 +; CHECK-NEXT: addq %r8, %r8 +; CHECK-NEXT: addq %r10, %r8 ; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: addq %rax, %r11 @@ -132,9 +132,9 @@ ; CHECK-NEXT: addq %r11, %rax ; CHECK-NEXT: addq %rdx, %rbx ; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: leaq (%r10,%r9), %rax -; CHECK-NEXT: leaq (%r8,%rax), %r15 -; CHECK-NEXT: addq %rax, 
%r15 +; CHECK-NEXT: leaq (%r10,%r9), %r15 +; CHECK-NEXT: addq %r15, %r15 +; CHECK-NEXT: addq %r8, %r15 ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: addq %r11, %rbx ; CHECK-NEXT: addq %rbx, %r14 @@ -145,9 +145,9 @@ ; CHECK-NEXT: addq %r14, %rbx ; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: leaq (%r8,%r10), %rbx -; CHECK-NEXT: leaq (%r15,%rbx), %rsi -; CHECK-NEXT: addq %rbx, %rsi +; CHECK-NEXT: leaq (%r8,%r10), %rsi +; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: addq %r15, %rsi ; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %r14, %rax ; CHECK-NEXT: addq %rax, %r9 @@ -158,9 +158,9 @@ ; CHECK-NEXT: addq %r9, %rax ; CHECK-NEXT: addq %rdi, %rbx ; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: leaq (%r15,%r8), %rax -; CHECK-NEXT: leaq (%rsi,%rax), %r12 -; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: leaq (%r15,%r8), %r12 +; CHECK-NEXT: addq %r12, %r12 +; CHECK-NEXT: addq %rsi, %r12 ; CHECK-NEXT: movq X(%rip), %rcx ; CHECK-NEXT: addq %r9, %rbx ; CHECK-NEXT: addq %rbx, %r10 @@ -171,9 +171,9 @@ ; CHECK-NEXT: addq %r10, %rax ; CHECK-NEXT: addq %r11, %rcx ; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%rsi,%r15), %rbx -; CHECK-NEXT: leaq (%r12,%rbx), %rax -; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: leaq (%rsi,%r15), %rax +; CHECK-NEXT: addq %rax, %rax +; CHECK-NEXT: addq %r12, %rax ; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %r10, %rcx ; CHECK-NEXT: addq %rcx, %r8 @@ -184,9 +184,9 @@ ; CHECK-NEXT: addq %r8, %rcx ; CHECK-NEXT: addq %r14, %rbx ; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: leaq (%r12,%rsi), %rdx -; CHECK-NEXT: leaq (%rax,%rdx), %rcx -; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: leaq (%r12,%rsi), %rcx +; CHECK-NEXT: addq %rcx, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movq X(%rip), %rdx ; CHECK-NEXT: addq %r8, %rbx ; CHECK-NEXT: addq %rbx, %r15 @@ -197,9 +197,9 @@ ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: addq %r9, %rdx ; CHECK-NEXT: addq %rbx, %rdx -; CHECK-NEXT: leaq (%rax,%r12), %r9 -; CHECK-NEXT: leaq 
(%rcx,%r9), %rbx -; CHECK-NEXT: addq %r9, %rbx +; CHECK-NEXT: leaq (%rax,%r12), %rbx +; CHECK-NEXT: addq %rbx, %rbx +; CHECK-NEXT: addq %rcx, %rbx ; CHECK-NEXT: addq %r15, %rdx ; CHECK-NEXT: addq %rdx, %rsi ; CHECK-NEXT: addq %rcx, %rbx @@ -211,12 +211,12 @@ ; CHECK-NEXT: addq %rsi, %rdi ; CHECK-NEXT: addq %rdi, %rdx ; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%rbx,%rcx), %rdi -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: addq %rbx, %rdi +; CHECK-NEXT: addq %rcx, %rcx +; CHECK-NEXT: addq %rbx, %rcx +; CHECK-NEXT: addq %rbx, %rcx ; CHECK-NEXT: addq %rsi, %rdx ; CHECK-NEXT: addq %rdx, %r12 -; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: addq %rdx, %rcx ; CHECK-NEXT: addq %r15, %rsi ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: bswapq %rax @@ -225,7 +225,7 @@ ; CHECK-NEXT: addq %r12, %rsi ; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: addq %r12, %rax -; CHECK-NEXT: addq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r14 Index: llvm/test/CodeGen/X86/lea-opt2.ll =================================================================== --- llvm/test/CodeGen/X86/lea-opt2.ll +++ llvm/test/CodeGen/X86/lea-opt2.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -verify-machineinstrs | FileCheck %s ; This file tests following optimization ; @@ -11,15 +11,14 @@ ; subl %edx, %ecx ; subl %eax, %ecx -; TODO: replace lea with sub. 
; C - (A + B) --> C - A - B define i32 @test1(i32* %p, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: leal (%rdx,%rax), %esi -; CHECK-NEXT: subl %esi, %ecx +; CHECK-NEXT: subl %edx, %ecx +; CHECK-NEXT: subl %eax, %ecx ; CHECK-NEXT: movl %ecx, (%rdi) ; CHECK-NEXT: subl %edx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax @@ -32,16 +31,15 @@ ret i32 %sub1 } -; TODO: replace lea with add. ; (A + B) + C --> C + A + B define i32 @test2(i32* %p, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: leal (%rax,%rdx), %esi -; CHECK-NEXT: addl %ecx, %esi -; CHECK-NEXT: movl %esi, (%rdi) +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl %ecx, (%rdi) ; CHECK-NEXT: subl %edx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -53,16 +51,15 @@ ret i32 %sub1 } -; TODO: replace lea with add. ; C + (A + B) --> C + A + B define i32 @test3(i32* %p, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test3: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: leal (%rax,%rdx), %esi -; CHECK-NEXT: addl %ecx, %esi -; CHECK-NEXT: movl %esi, (%rdi) +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl %ecx, (%rdi) ; CHECK-NEXT: subl %edx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -95,13 +92,12 @@ ret i32 %sub1 } -; TODO: replace lea with sub. 
define i64 @test5(i64* %p, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: leaq (%rdx,%rax), %rsi -; CHECK-NEXT: subq %rsi, %rcx +; CHECK-NEXT: subq %rdx, %rcx +; CHECK-NEXT: subq %rax, %rcx ; CHECK-NEXT: movq %rcx, (%rdi) ; CHECK-NEXT: subq %rdx, %rax ; CHECK-NEXT: retq @@ -114,14 +110,13 @@ ret i64 %sub1 } -; TODO: replace lea with add. define i64 @test6(i64* %p, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: leaq (%rdx,%rax), %rsi -; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: movq %rsi, (%rdi) +; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: movq %rcx, (%rdi) ; CHECK-NEXT: subq %rdx, %rax ; CHECK-NEXT: retq entry: @@ -133,14 +128,13 @@ ret i64 %sub1 } -; TODO: replace lea with add. define i64 @test7(i64* %p, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: test7: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: leaq (%rdx,%rax), %rsi -; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: movq %rsi, (%rdi) +; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: movq %rcx, (%rdi) ; CHECK-NEXT: subq %rdx, %rax ; CHECK-NEXT: retq entry: @@ -152,3 +146,118 @@ ret i64 %sub1 } +; The sub instruction generated flags is used by following branch, +; so it should not be transformed. 
+define i64 @test8(i64* %p, i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: leaq (%rdx,%rax), %rsi +; CHECK-NEXT: subq %rsi, %rcx +; CHECK-NEXT: ja .LBB7_2 +; CHECK-NEXT: # %bb.1: # %then +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: subq %rdx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB7_2: # %else +; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: subq %rdx, %rax +; CHECK-NEXT: retq +entry: + %ld = load i64, i64* %p, align 8 + %0 = add i64 %b, %ld + %sub = sub i64 %c, %0 + %cond = icmp ule i64 %c, %0 + br i1 %cond, label %then, label %else + +then: + store i64 %sub, i64* %p, align 8 + br label %endif + +else: + store i64 0, i64* %p, align 8 + br label %endif + +endif: + %sub1 = sub i64 %ld, %b + ret i64 %sub1 +} + +; PR50615 +; The sub register usage of lea dest should block the transformation. +define void @test9(i64 %p, i64 %s) { +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testl $4095, %eax # imm = 0xFFF +; CHECK-NEXT: setne %cl +; CHECK-NEXT: shlq $12, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: andq $-4096, %rcx # imm = 0xF000 +; CHECK-NEXT: addq %rcx, %rdi +; CHECK-NEXT: jmp bar@PLT # TAILCALL +entry: + %add = add i64 %s, %p + %rem = and i64 %add, 4095 + %cmp.not = icmp eq i64 %rem, 0 + %add18 = select i1 %cmp.not, i64 0, i64 4096 + %div9 = add i64 %add18, %add + %mul = and i64 %div9, -4096 + %add2 = add i64 %mul, %p + tail call void @bar(i64 %add2, i64 %s) + ret void +} + +define void @test10() { +; CHECK-LABEL: test10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl (%rax), %eax +; CHECK-NEXT: movzwl (%rax), %ecx +; CHECK-NEXT: leal (%rcx,%rcx,2), %esi +; CHECK-NEXT: movl %ecx, %edi +; CHECK-NEXT: subl %ecx, %edi +; CHECK-NEXT: subl %ecx, %edi +; CHECK-NEXT: negl %esi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: cmpl $4, %eax +; CHECK-NEXT: movl %edi, (%rax) +; CHECK-NEXT: movl %esi, 
(%rax) +; CHECK-NEXT: cmovnel %eax, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %esi +; CHECK-NEXT: movl %esi, (%rax) +; CHECK-NEXT: retq +entry: + %tmp = load i32, i32* undef, align 4 + %tmp3 = sdiv i32 undef, 6 + %tmp4 = load i32, i32* undef, align 4 + %tmp5 = icmp eq i32 %tmp4, 4 + %tmp6 = select i1 %tmp5, i32 %tmp3, i32 %tmp + %tmp10 = load i16, i16* undef, align 2 + %tmp11 = zext i16 %tmp10 to i32 + %tmp13 = zext i16 undef to i32 + %tmp15 = load i16, i16* undef, align 2 + %tmp16 = zext i16 %tmp15 to i32 + %tmp19 = shl nsw i32 undef, 1 + %tmp25 = shl nsw i32 undef, 1 + %tmp26 = add nsw i32 %tmp25, %tmp13 + %tmp28 = shl nsw i32 undef, 1 + %tmp29 = add nsw i32 %tmp28, %tmp16 + %tmp30 = sub nsw i32 %tmp19, %tmp29 + %tmp31 = sub nsw i32 %tmp11, %tmp26 + %tmp32 = shl nsw i32 %tmp30, 1 + %tmp33 = add nsw i32 %tmp32, %tmp31 + store i32 %tmp33, i32* undef, align 4 + %tmp34 = mul nsw i32 %tmp31, -2 + %tmp35 = add nsw i32 %tmp34, %tmp30 + store i32 %tmp35, i32* undef, align 4 + %tmp36 = select i1 %tmp5, i32 undef, i32 undef + %tmp38 = load i32, i32* undef, align 4 + %tmp39 = ashr i32 %tmp38, %tmp6 + store i32 %tmp39, i32* undef, align 4 + ret void +} + +declare void @bar(i64, i64) + Index: llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll =================================================================== --- llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -53,9 +53,9 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: kmovw %k1, %ecx ; X86-NEXT: addl %edi, %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movw %cx, (%esi) +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movw %ax, (%esi) ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -107,10 +107,10 @@ ; X64-NEXT: kmovw %k1, %ebx ; X64-NEXT: addl %edi, %eax ; X64-NEXT: addl %ecx, %edx -; X64-NEXT: leal (%rbx,%rsi), %ecx -; X64-NEXT: addl %eax, 
%ecx -; X64-NEXT: addl %edx, %ecx -; X64-NEXT: movw %cx, (%r14) +; X64-NEXT: addl %ebx, %eax +; X64-NEXT: addl %esi, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: movw %ax, (%r14) ; X64-NEXT: leaq -16(%rbp), %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14