Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1410,6 +1410,19 @@
     return false;
   }
 
+  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Scale == 1) {
+    if (AM.Base_Reg == N) {
+      SDValue Base_Reg = AM.Base_Reg;
+      AM.Base_Reg = AM.IndexReg;
+      AM.IndexReg = Base_Reg;
+      AM.Scale = 2;
+      return false;
+    } else if (AM.IndexReg == N) {
+      AM.Scale = 2;
+      return false;
+    }
+  }
+
   // Otherwise, we cannot select it.
   return true;
 }
Index: lib/Target/X86/X86OptimizeLEAs.cpp
===================================================================
--- lib/Target/X86/X86OptimizeLEAs.cpp
+++ lib/Target/X86/X86OptimizeLEAs.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -44,6 +45,7 @@
     cl::init(false));
 
 STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+STATISTIC(NumFactoredLEAs, "Number of LEAs factorized");
 STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
 
 /// \brief Returns true if two machine operands are identical and they are not
@@ -65,8 +67,8 @@
 public:
   MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
            const MachineOperand *Index, const MachineOperand *Segment,
-           const MachineOperand *Disp)
-      : Disp(Disp) {
+           const MachineOperand *Disp, bool DispCheck = false)
+      : Disp(Disp), HardDispCheck(DispCheck) {
     Operands[0] = Base;
     Operands[1] = Scale;
     Operands[2] = Index;
@@ -82,8 +84,10 @@
     // Addresses' displacements don't have to be exactly the same. It only
     // matters that they use the same symbol/index/address. Immediates' or
     // offsets' differences will be taken care of during instruction
-    // substitution.
-    return isSimilarDispOp(*Disp, *Other.Disp);
+    // substitution. If HardDispCheck is true then Disp must be identical.
+    if (!HardDispCheck)
+      return isSimilarDispOp(*Disp, *Other.Disp);
+    return isIdenticalOp(*Disp, *Other.Disp);
   }
 
   // Address' base, scale, index and segment operands.
@@ -91,6 +95,9 @@
 
   // Address' displacement operand.
   const MachineOperand *Disp;
+
+  // Forces absolute displacement check.
+  bool HardDispCheck;
 };
 } // end anonymous namespace
 
@@ -178,6 +185,17 @@
                    &MI.getOperand(N + X86::AddrDisp));
 }
 
+static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) {
+  static MachineOperand DummyScale = MachineOperand::CreateImm(1);
+  assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+         "The instruction must be a LEA, a load or a store");
+  return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg), &DummyScale,
+                  &MI.getOperand(N + X86::AddrIndexReg),
+                  &MI.getOperand(N + X86::AddrSegmentReg),
+                  &MI.getOperand(N + X86::AddrDisp), true);
+}
+
 static inline bool isIdenticalOp(const MachineOperand &MO1,
                                  const MachineOperand &MO2) {
   return MO1.isIdenticalTo(MO2) &&
@@ -217,8 +235,124 @@
 }
 
 namespace {
+
+class FactorizeLEAOpt {
+public:
+  using LEAListT = std::list<MachineInstr *>;
+  using LEAMapT = DenseMap<MemOpKey, LEAListT>;
+  using ValueT = DenseMap<MemOpKey, unsigned>;
+  using ScopeEntryT = std::pair<MachineBasicBlock *, ValueT>;
+  using ScopeStackT = std::vector<ScopeEntryT>;
+
+  FactorizeLEAOpt() = default;
+  FactorizeLEAOpt(const FactorizeLEAOpt &) = delete;
+  FactorizeLEAOpt &operator=(const FactorizeLEAOpt &) = delete;
+
+  void performCleanup() {
+    for (auto LEA : removedLEAs)
+      LEA->eraseFromParent();
+    removedLEAs.clear();
+    LEAs.clear();
+    Stack.clear();
+  }
+
+  LEAMapT &getLEAMap() { return LEAs; }
+  ScopeEntryT *getTopScope() { return &Stack.back(); }
+
+  void addForLazyRemoval(MachineInstr *Instr) { removedLEAs.push_back(Instr); }
+
+  /// Pushes a ScopeEntry for the given basic block onto the Stack. Also
+  /// traverses the block's instructions and updates the LEAs map and the
+  /// ScopeEntry for every LEA instruction found, using insertLEA().
+  void pushScope(MachineBasicBlock *MBB);
+
+  /// Pops the ScopeEntry of the topmost basic block from the stack and
+  /// removes the LEA instructions contained in that scope from the LEAs map.
+  void popScope();
+
+  void insertLEA(MachineInstr *MI);
+
+  /// An LEA that uses physical registers is not a candidate for
+  /// factorization, since physical registers may violate the SSA
+  /// semantics of MI.
+  bool containsPhyReg(MachineInstr &MI);
+
+private:
+  ScopeStackT Stack;
+  LEAMapT LEAs;
+  std::vector<MachineInstr *> removedLEAs;
+};
+
+void FactorizeLEAOpt::pushScope(MachineBasicBlock *MBB) {
+  ValueT EmptyMap;
+  ScopeEntryT SE = std::make_pair(MBB, EmptyMap);
+  Stack.push_back(SE);
+  for (auto &MI : *MBB) {
+    if (isLEA(MI))
+      insertLEA(&MI);
+  }
+}
+
+void FactorizeLEAOpt::popScope() {
+  ScopeEntryT &SE = Stack.back();
+  for (auto MapEntry : SE.second) {
+    LEAMapT::iterator Itr = LEAs.find(MapEntry.first);
+    assert((Itr != LEAs.end()) &&
+           "LEAs map must have a node corresponding to ScopeEntry's Key.");
+
+    while ((*Itr).second.size() > MapEntry.second)
+      (*Itr).second.pop_front();
+    // If the list becomes empty, remove the entry from the LEAs map.
+    if ((*Itr).second.empty())
+      LEAs.erase(Itr);
+  }
+  Stack.pop_back();
+}
+
+bool FactorizeLEAOpt::containsPhyReg(MachineInstr &MI) {
+  MachineOperand Res = MI.getOperand(0);
+  MachineOperand Base = MI.getOperand(1);
+  MachineOperand Index = MI.getOperand(3);
+
+  if (Res.isReg() && TargetRegisterInfo::isPhysicalRegister(Res.getReg()))
+    return true;
+  if (Base.isReg() && TargetRegisterInfo::isPhysicalRegister(Base.getReg()))
+    return true;
+  if (Index.isReg() && TargetRegisterInfo::isPhysicalRegister(Index.getReg()))
+    return true;
+
+  return false;
+}
+
+void FactorizeLEAOpt::insertLEA(MachineInstr *MI) {
+  unsigned lsize = 0;
+  if (containsPhyReg(*MI))
+    return;
+
+  MemOpKey Key = getMemOpCSEKey(*MI, 1);
+  ScopeEntryT *TopScope = getTopScope();
+
+  LEAMapT::iterator Itr = LEAs.find(Key);
+  if (Itr == LEAs.end()) {
+    lsize = 0;
+    LEAs[Key].push_front(MI);
+  } else {
+    lsize = (*Itr).second.size();
+    (*Itr).second.push_front(MI);
+  }
+  if (TopScope->second.find(Key) == TopScope->second.end())
+    TopScope->second[Key] = lsize;
+}
+
 class OptimizeLEAPass : public MachineFunctionPass {
 public:
   OptimizeLEAPass() : MachineFunctionPass(ID) {}
 
   StringRef getPassName() const override { return "X86 LEA Optimize"; }
@@ -228,6 +362,13 @@
   /// been calculated by LEA. Also, remove redundant LEAs.
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+  }
+
 private:
   typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
 
@@ -273,8 +414,20 @@
   /// \brief Removes LEAs which calculate similar addresses.
   bool removeRedundantLEAs(MemOpMap &LEAs);
 
+  /// \brief Visits basic blocks, collects LEAs in a scoped hash map
+  /// (FactorizeLEAOpt::LEAs) and tries to factor them out.
+  bool FactorizeLEAsAllBasicBlocks(MachineFunction &MF);
+
+  bool FactorizeLEAsBasicBlock(MachineDomTreeNode *DN);
+
+  /// \brief Factors out LEAs which share the same Base, Index, Offset and
+  /// Segment.
+  bool processBasicBlock(const MachineBasicBlock &MBB);
+
   DenseMap<const MachineInstr *, unsigned> InstrPos;
 
+  FactorizeLEAOpt FactorOpt;
+
+  MachineDominatorTree *DT;
   MachineRegisterInfo *MRI;
   const X86InstrInfo *TII;
   const X86RegisterInfo *TRI;
@@ -646,6 +799,89 @@
   return Changed;
 }
 
+bool OptimizeLEAPass::processBasicBlock(const MachineBasicBlock &MBB) {
+  bool cseDone = false;
+
+  // Lookup table of legal scale values (1, 2, 4 and 8), indexed by the
+  // difference between two scales.
+  int LegalScale[9] = {0, 1, 1, 0, 1, 0, 0, 0, 1};
+
+  // Sort LEAs in decreasing order of scale. The comparator must define a
+  // strict weak ordering, so equal scales compare as "not less".
+  auto CompareFn = [](const MachineInstr *Arg1,
+                      const MachineInstr *Arg2) -> bool {
+    return Arg1->getOperand(2).getImm() > Arg2->getOperand(2).getImm();
+  };
+
+  // Loop over all entries in the table.
+  for (auto &E : FactorOpt.getLEAMap()) {
+    auto &List = E.second;
+    if (List.size() > 1)
+      List.sort(CompareFn);
+
+    // Loop over all LEA pairs.
+    for (auto Iter1 = List.begin(); Iter1 != List.end(); Iter1++) {
+      for (auto Iter2 = std::next(Iter1); Iter2 != List.end(); Iter2++) {
+        MachineInstr &LI1 = **Iter1;
+        MachineInstr &LI2 = **Iter2;
+
+        if (!DT->dominates(&LI2, &LI1))
+          continue;
+
+        int Scale1 = LI1.getOperand(2).getImm();
+        int Scale2 = LI2.getOperand(2).getImm();
+        assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg");
+        DebugLoc DL = LI1.getDebugLoc();
+
+        int Factor = Scale1 - Scale2;
+        if (Factor > 0 && LegalScale[Factor]) {
+          DEBUG(dbgs() << "CSE LEAs: Candidate to replace: "; LI1.dump(););
+          MachineInstr *NewMI =
+              BuildMI(*(const_cast<MachineBasicBlock *>(&MBB)), &LI1, DL,
+                      TII->get(LI1.getOpcode()))
+                  .addDef(LI1.getOperand(0).getReg())  // Dst     = Dst of LI1.
+                  .addUse(LI2.getOperand(0).getReg())  // Base    = Dst of LI2.
+                  .addImm(Factor)                      // Scale   = Difference of scales.
+                  .addUse(LI1.getOperand(3).getReg())  // Index   = Index of LI1.
+                  .addImm(0)                           // Disp    = 0.
+                  .addUse(LI1.getOperand(5).getReg()); // Segment = Segment of LI1.
+
+          cseDone = NewMI != nullptr;
+
+          // Lazy removal ensures that the replaced LEA remains available
+          // until we finish processing all the basic blocks. This provides
+          // an opportunity for further factorization based on the replaced
+          // LEA, which stays legal since it has the same destination as the
+          // newly formed LEA.
+          FactorOpt.addForLazyRemoval(&LI1);
+
+          NumFactoredLEAs++;
+          DEBUG(dbgs() << "CSE LEAs: Replaced by: "; NewMI->dump(););
+        }
+      }
+    }
+  }
+  return cseDone;
+}
+
+bool OptimizeLEAPass::FactorizeLEAsBasicBlock(MachineDomTreeNode *DN) {
+  bool Changed = false;
+  MachineBasicBlock *MBB = DN->getBlock();
+  FactorOpt.pushScope(MBB);
+
+  Changed |= processBasicBlock(*MBB);
+  for (auto Child : DN->getChildren())
+    Changed |= FactorizeLEAsBasicBlock(Child);
+
+  FactorOpt.popScope();
+  return Changed;
+}
+
+bool OptimizeLEAPass::FactorizeLEAsAllBasicBlocks(MachineFunction &MF) {
+  bool Changed = FactorizeLEAsBasicBlock(DT->getRootNode());
+  FactorOpt.performCleanup();
+  return Changed;
+}
+
 bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
@@ -655,6 +891,10 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+
+  // Attempt to factorize LEAs.
+  Changed |= FactorizeLEAsAllBasicBlocks(MF);
 
   // Process all basic blocks.
   for (auto &MBB : MF) {
Index: test/CodeGen/X86/lea-opt-csebb.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lea-opt-csebb.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X64-LABEL: foo:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB0_1: # %loop
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl 16(%rdi), %eax
+; X64-NEXT:    leal 1(%rcx,%rax), %ecx
+; X64-NEXT:    movl %ecx, 12(%rdi)
+; X64-NEXT:    decl %esi
+; X64-NEXT:    jne .LBB0_1
+; X64-NEXT:    # BB#2: # %exit
+; X64-NEXT:    leal (%ecx,%rax), %eax
+; X64-NEXT:    movl %eax, 16(%rdi)
+; X64-NEXT:    retq
+;
+; X86-LABEL: foo:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %esi
+; X86-NEXT:  .Lcfi0:
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .Lcfi1:
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB0_1: # %loop
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    movl 16(%eax), %edx
+; X86-NEXT:    leal 1(%esi,%edx), %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    decl %ecx
+; X86-NEXT:    jne .LBB0_1
+; X86-NEXT:    # BB#2: # %exit
+; X86-NEXT:    leal (%esi,%edx), %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+entry:
+  br label %loop
+
+loop:
+  %iter = phi i32 [ %n, %entry ], [ %iter.ctr, %loop ]
+  %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+  %0 = load i32, i32* %h0, align 8
+  %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+  %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+  %1 = load i32, i32* %h4, align 8
+  %add = add i32 %0, 1
+  %add4 = add i32 %add, %1
+  store i32 %add4, i32* %h3, align 4
+  %add29 = add i32 %add4, %1
+  %iter.ctr = sub i32 %iter, 1
+  %res = icmp ne i32 %iter.ctr, 0
+  br i1 %res, label %loop, label %exit
+
+exit:
+  store i32 %add29, i32* %h4, align 8
+  ret void
+}
Index: test/CodeGen/X86/lea-opt-cst.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lea-opt-cst.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X64-LABEL: test_func:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl 16(%rdi), %ecx
+; X64-NEXT:    leal 1(%rax,%rcx), %eax
+; X64-NEXT:    movl %eax, 12(%rdi)
+; X64-NEXT:    leal (%eax,%rcx), %eax
+; X64-NEXT:    movl %eax, 16(%rdi)
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_func:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 16(%eax), %edx
+; X86-NEXT:    leal 1(%ecx,%edx), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal (%ecx,%edx), %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    retl
+entry:
+  %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+  %0 = load i32, i32* %h0, align 8
+  %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+  %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+  %1 = load i32, i32* %h4, align 8
+  %add = add i32 %0, 1
+  %add4 = add i32 %add, %1
+  store i32 %add4, i32* %h3, align 4
+  %add29 = add i32 %add4, %1
+  store i32 %add29, i32* %h4, align 8
+  ret void
+}
Index: test/CodeGen/X86/mul-constant-i16.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i16.ll
+++ test/CodeGen/X86/mul-constant-i16.ll
@@ -558,11 +558,10 @@
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
@@ -571,8 +570,7 @@
 ; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
 ; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    leal (%rax,%rdi,2), %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 29
Index: test/CodeGen/X86/mul-constant-i32.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i32.ll
+++ test/CodeGen/X86/mul-constant-i32.ll
@@ -1457,11 +1457,10 @@
 define i32 @test_mul_by_29(i32 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-HSW-LABEL: test_mul_by_29:
@@ -1469,8 +1468,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
@@ -1478,8 +1476,7 @@
 ; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
Index: test/CodeGen/X86/mul-constant-i64.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i64.ll
+++ test/CodeGen/X86/mul-constant-i64.ll
@@ -1523,8 +1523,7 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
@@ -1534,16 +1533,14 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    leaq (%rax,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
 ; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rdi,2), %rax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- test/CodeGen/X86/mul-constant-result.ll
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -163,8 +163,7 @@
 ; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_36:
@@ -322,16 +321,17 @@
 ; X64-HSW-NEXT:  .LBB0_31:
 ; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
 ; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_17
-; X64-HSW-NEXT:  .LBB0_32:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:  .LBB0_17:
 ; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
 ; X64-HSW-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_32:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_33:
 ; X64-HSW-NEXT:    movl %eax, %ecx
 ; X64-HSW-NEXT:    shll $5, %ecx
Index: test/CodeGen/X86/umul-with-overflow.ll
===================================================================
--- test/CodeGen/X86/umul-with-overflow.ll
+++ test/CodeGen/X86/umul-with-overflow.ll
@@ -1,37 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
 
 declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 
 define zeroext i1 @a(i32 %x) nounwind {
+; CHECK-LABEL: a:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $3, %ecx
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    seto %al
+; CHECK-NEXT:    retl
   %res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3)
   %obil = extractvalue {i32, i1} %res, 1
   ret i1 %obil
-
-; CHECK-LABEL: a:
-; CHECK: mull
-; CHECK: seto %al
-; CHECK: ret
+
 }
 
 define i32 @test2(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    retl
 entry:
   %tmp0 = add i32 %b, %a
   %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2)
   %tmp2 = extractvalue { i32, i1 } %tmp1, 0
   ret i32 %tmp2
-; CHECK-LABEL: test2:
-; CHECK: addl
-; CHECK-NEXT: addl
-; CHECK-NEXT: ret
 }
 
 define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $4, %ecx
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    retl
 entry:
   %tmp0 = add i32 %b, %a
   %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4)
   %tmp2 = extractvalue { i32, i1 } %tmp1, 0
   ret i32 %tmp2
-; CHECK-LABEL: test3:
-; CHECK: addl
-; CHECK: mull
-; CHECK-NEXT: ret
 }
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -13,14 +13,14 @@
 ; X64-NEXT: .p2align
 ; X64: %loop
 ; no complex address modes
-; X64-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X64-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 ;
 ; X32: @simple
 ; no expensive address computation in the preheader
 ; X32-NOT: imul
 ; X32: %loop
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
 entry:
   br label %loop
@@ -103,7 +103,7 @@
 ; X32-NOT: mov{{.*}}(%esp){{$}}
 ; X32: %for.body{{$}}
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 ; no reloads
 ; X32-NOT: (%esp)
 define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
Index: utils/TableGen/DAGISelMatcherGen.cpp
===================================================================
--- utils/TableGen/DAGISelMatcherGen.cpp
+++ utils/TableGen/DAGISelMatcherGen.cpp
@@ -305,7 +305,7 @@
   const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator());
 
   // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is
-  // a constant without a predicate fn that has more that one bit set, handle
+  // a constant without a predicate fn that has more than one bit set, handle
   // this as a special case. This is usually for targets that have special
   // handling of certain large constants (e.g. alpha with it's 8/16/32-bit
   // handling stuff). Using these instructions is often far more efficient
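
A minimal sketch of the rewrite processBasicBlock performs, assuming two LEAs
in the same dominator scope that share base, index, segment and displacement
and whose scale difference is a legal scale (1, 2, 4 or 8); the virtual
register names and constants below are illustrative only, not taken from the
patch:

  ; Before factorization: both LEAs recompute the full address.
  %vreg1 = LEA64r %vreg0, 2, %vreg9, 8, %noreg   ; %vreg1 = base + 2*index + 8
  %vreg2 = LEA64r %vreg0, 4, %vreg9, 8, %noreg   ; %vreg2 = base + 4*index + 8

  ; After factorization: the dominated LEA is rebuilt on top of the dominating
  ; one, with Scale = 4 - 2 = 2 and Disp = 0; the old LEA is queued for lazy
  ; removal so it can still seed further factorizations in the same scope.
  %vreg2 = LEA64r %vreg1, 2, %vreg9, 0, %noreg   ; %vreg2 = %vreg1 + 2*index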