Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1410,6 +1410,19 @@ return false; } + if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Scale == 1) { + if (AM.Base_Reg == N) { + SDValue Base_Reg = AM.Base_Reg; + AM.Base_Reg = AM.IndexReg; + AM.IndexReg = Base_Reg; + AM.Scale = 2; + return false; + } else if (AM.IndexReg == N) { + AM.Scale = 2; + return false; + } + } + // Otherwise, we cannot select it. return true; } Index: lib/Target/X86/X86OptimizeLEAs.cpp =================================================================== --- lib/Target/X86/X86OptimizeLEAs.cpp +++ lib/Target/X86/X86OptimizeLEAs.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -65,8 +66,8 @@ public: MemOpKey(const MachineOperand *Base, const MachineOperand *Scale, const MachineOperand *Index, const MachineOperand *Segment, - const MachineOperand *Disp) - : Disp(Disp) { + const MachineOperand *Disp, bool DispCheck = false) + : Disp(Disp), HardDispCheck(DispCheck) { Operands[0] = Base; Operands[1] = Scale; Operands[2] = Index; @@ -82,8 +83,10 @@ // Addresses' displacements don't have to be exactly the same. It only // matters that they use the same symbol/index/address. Immediates' or // offsets' differences will be taken care of during instruction - // substitution. - return isSimilarDispOp(*Disp, *Other.Disp); + // substitution. If HardDispCheck is true then Disp must be identical. + if (!HardDispCheck) + return isSimilarDispOp(*Disp, *Other.Disp); + return isIdenticalOp(*Disp,*Other.Disp); } // Address' base, scale, index and segment operands. @@ -91,6 +94,9 @@ // Address' displacement operand. const MachineOperand *Disp; + + // Forces absolute displacement check. + bool HardDispCheck; }; } // end anonymous namespace @@ -178,6 +184,17 @@ &MI.getOperand(N + X86::AddrDisp)); } +static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) { + static MachineOperand DummyScale = MachineOperand::CreateImm(1); + assert((isLEA(MI) || MI.mayLoadOrStore()) && + "The instruction must be a LEA, a load or a store"); + return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg), + &DummyScale, + &MI.getOperand(N + X86::AddrIndexReg), + &MI.getOperand(N + X86::AddrSegmentReg), + &MI.getOperand(N + X86::AddrDisp), true); +} + static inline bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2) { return MO1.isIdenticalTo(MO2) && @@ -228,6 +245,12 @@ /// been calculated by LEA. Also, remove redundant LEAs. bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + } + private: typedef DenseMap> MemOpMap; @@ -261,6 +284,13 @@ /// distance between them. void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs); + /// \brief Find all LEA instructions in the basic block that have same + /// Base, Index, Disp and Segment. + void populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs); + + /// \brief Factor out LEAs which share Base,Index,Offset and Segment. + bool cseLEAs(const MachineBasicBlock &MBB); + /// \brief Removes redundant address calculations. bool removeRedundantAddrCalc(MemOpMap &LEAs); @@ -275,6 +305,7 @@ DenseMap InstrPos; + MachineDominatorTree *DT; MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; @@ -286,6 +317,13 @@ FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } +void OptimizeLEAPass::populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs) { + for (auto &MI : MBB) { + if (isLEA(MI)) + LEAs[getMemOpCSEKey(MI, 1)].push_back(const_cast(&MI)); + } +} + int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, const MachineInstr &Last) { // Both instructions must be in the same basic block and they must be @@ -646,6 +684,66 @@ return Changed; } +bool OptimizeLEAPass::cseLEAs(const MachineBasicBlock &MBB) { + MemOpMap LEAs; + bool cseDone = false; + + // Legal scale value (1,2,4 & 8) vector. + int LegalScale[9] = {0,1,1,0,1,0,0,0,1}; + + populateCSEMap(MBB,LEAs); + + auto CompareFn = + [] (const MachineInstr *Arg1,const MachineInstr *Arg2) -> bool { + if(Arg1->getOperand(2).getImm() < Arg2->getOperand(2).getImm()) + return false; + return true; + }; + + // Loop over all entries in the table. + for (auto &E : LEAs) { + auto &List = E.second; + if(List.size() > 1) { + std::sort(List.begin(),List.end(),CompareFn); + } + // Loop over all LEA pairs. + for(auto LII = List.begin(); LII != List.end(); LII++) { + MachineInstr &LI1 = **LII; + auto LINext = std::next(LII); + if(LINext == List.end()) + break; + MachineInstr &LI2 = **LINext; + if (!DT->dominates(&LI2,&LI1)) + continue; + + int Scale1 = LI1.getOperand(2).getImm(); + int Scale2 = LI2.getOperand(2).getImm(); + assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg"); + DebugLoc DL = LI1.getDebugLoc(); + + int Factor = Scale1 - Scale2; + if (Factor > 0 && LegalScale[Factor]) { + DEBUG(dbgs() << "CSE LEAs: Candidate to replace: "; LI1.dump();); + MachineInstr *NewMI = + BuildMI(*(const_cast(&MBB)), + &LI1,DL,TII->get(LI1.getOpcode())) + .addDef(LI1.getOperand(0).getReg()) // Dst = Dst of LI1. + .addUse(LI2.getOperand(0).getReg()) // Base = Dst of LI2. + .addImm(Factor) // Scale = Diff b/w scales. + .addUse(LI1.getOperand(3).getReg()) // Index = Index of LI1. + .addImm(0) // Disp = 0 + .addUse(LI1.getOperand(5).getReg()); // Segment = Segmant of LI1. + + LI1.eraseFromParent(); + cseDone = NewMI != nullptr; + DEBUG(dbgs() << "CSE LEAs: Replaced by: "; NewMI->dump();); + } + } + } + return cseDone; +} + + bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; @@ -655,12 +753,16 @@ MRI = &MF.getRegInfo(); TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); + DT = &getAnalysis(); // Process all basic blocks. for (auto &MBB : MF) { MemOpMap LEAs; InstrPos.clear(); + // Attempt CSE over LEAs. + Changed |= cseLEAs(MBB); + // Find all LEA instructions in basic block. findLEAs(MBB, LEAs); Index: test/CodeGen/X86/lea-opt-cst.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/lea-opt-cst.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86 + +%struct.SA = type { i32 , i32 , i32 , i32 ,i32 } + +define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr { +; X64-LABEL: test_func: +; X64: # BB#0: # %entry +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl 16(%rdi), %ecx +; X64-NEXT: leal 1(%rax,%rcx), %eax +; X64-NEXT: movl %eax, 12(%rdi) +; X64-NEXT: leal (%eax,%rcx), %eax +; X64-NEXT: movl %eax, 16(%rdi) +; X64-NEXT: retq +; +; X86-LABEL: test_func: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 16(%eax), %edx +; X86-NEXT: leal 1(%ecx,%edx), %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal (%ecx,%edx), %ecx +; X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: retl + entry: + %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0 + %0 = load i32, i32* %h0, align 8 + %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3 + %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4 + %1 = load i32, i32* %h4, align 8 + %add = add i32 %0, 1 + %add4 = add i32 %add, %1 + store i32 %add4, i32* %h3, align 4 + %add29 = add i32 %add4 , %1 + store i32 %add29, i32* %h4, align 8 + ret void +} Index: test/CodeGen/X86/mul-constant-i16.ll =================================================================== --- test/CodeGen/X86/mul-constant-i16.ll +++ test/CodeGen/X86/mul-constant-i16.ll @@ -558,11 +558,10 @@ define i16 @test_mul_by_29(i16 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; @@ -571,8 +570,7 @@ ; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal (%rdi,%rdi,8), %eax ; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: leal (%rax,%rdi,2), %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 29 Index: test/CodeGen/X86/mul-constant-i32.ll =================================================================== --- test/CodeGen/X86/mul-constant-i32.ll +++ test/CodeGen/X86/mul-constant-i32.ll @@ -1457,11 +1457,10 @@ define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: retl ; ; X64-HSW-LABEL: test_mul_by_29: @@ -1469,8 +1468,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: retq # sched: [1:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: @@ -1478,8 +1476,7 @@ ; X64-JAG-NEXT: # kill: %EDI %EDI %RDI ; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax # sched: [1:0.50] ; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_29: Index: test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- test/CodeGen/X86/mul-constant-i64.ll +++ test/CodeGen/X86/mul-constant-i64.ll @@ -1523,8 +1523,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ecx, %edx @@ -1534,16 +1533,14 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: leaq (%rax,%rdi,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: retq # sched: [1:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: ; X64-JAG: # BB#0: ; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rax,%rdi,2), %rax # sched: [1:0.50] ; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_29: Index: test/CodeGen/X86/mul-constant-result.ll =================================================================== --- test/CodeGen/X86/mul-constant-result.ll +++ test/CodeGen/X86/mul-constant-result.ll @@ -163,8 +163,7 @@ ; X86-NEXT: .LBB0_35: ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; X86-NEXT: .LBB0_36: @@ -322,16 +321,17 @@ ; X64-HSW-NEXT: .LBB0_31: ; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx ; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx -; X64-HSW-NEXT: jmp .LBB0_17 -; X64-HSW-NEXT: .LBB0_32: -; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx -; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx -; X64-HSW-NEXT: addl %eax, %ecx ; X64-HSW-NEXT: .LBB0_17: ; X64-HSW-NEXT: addl %eax, %ecx ; X64-HSW-NEXT: movl %ecx, %eax ; X64-HSW-NEXT: # kill: %EAX %EAX %RAX ; X64-HSW-NEXT: retq +; X64-HSW-NEXT: .LBB0_32: +; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx +; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx +; X64-HSW-NEXT: leal (%rcx,%rax,2), %eax +; X64-HSW-NEXT: # kill: %EAX %EAX %RAX +; X64-HSW-NEXT: retq ; X64-HSW-NEXT: .LBB0_33: ; X64-HSW-NEXT: movl %eax, %ecx ; X64-HSW-NEXT: shll $5, %ecx Index: test/CodeGen/X86/umul-with-overflow.ll =================================================================== --- test/CodeGen/X86/umul-with-overflow.ll +++ test/CodeGen/X86/umul-with-overflow.ll @@ -1,37 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b) define zeroext i1 @a(i32 %x) nounwind { +; CHECK-LABEL: a: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl $3, %ecx +; CHECK-NEXT: mull %ecx +; CHECK-NEXT: seto %al +; CHECK-NEXT: retl %res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3) %obil = extractvalue {i32, i1} %res, 1 ret i1 %obil - -; CHECK-LABEL: a: -; CHECK: mull -; CHECK: seto %al -; CHECK: ret + } define i32 @test2(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: test2: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl %eax, %eax +; CHECK-NEXT: retl entry: %tmp0 = add i32 %b, %a %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2) %tmp2 = extractvalue { i32, i1 } %tmp1, 0 ret i32 %tmp2 -; CHECK-LABEL: test2: -; CHECK: addl -; CHECK-NEXT: addl -; CHECK-NEXT: ret } define i32 @test3(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: test3: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl $4, %ecx +; CHECK-NEXT: mull %ecx +; CHECK-NEXT: retl entry: %tmp0 = add i32 %b, %a %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4) %tmp2 = extractvalue { i32, i1 } %tmp1, 0 ret i32 %tmp2 -; CHECK-LABEL: test3: -; CHECK: addl -; CHECK: mull -; CHECK-NEXT: ret } Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll =================================================================== --- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -13,14 +13,14 @@ ; X64-NEXT: .p2align ; X64: %loop ; no complex address modes -; X64-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X64-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, ; ; X32: @simple ; no expensive address computation in the preheader ; X32-NOT: imul ; X32: %loop ; no complex address modes -; X32-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind { entry: br label %loop @@ -103,7 +103,7 @@ ; X32-NOT: mov{{.*}}(%esp){{$}} ; X32: %for.body{{$}} ; no complex address modes -; X32-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, ; no reloads ; X32-NOT: (%esp) define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind { Index: utils/TableGen/DAGISelMatcherGen.cpp =================================================================== --- utils/TableGen/DAGISelMatcherGen.cpp +++ utils/TableGen/DAGISelMatcherGen.cpp @@ -305,7 +305,7 @@ const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator()); // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is - // a constant without a predicate fn that has more that one bit set, handle + // a constant without a predicate fn that has more than one bit set, handle // this as a special case. This is usually for targets that have special // handling of certain large constants (e.g. alpha with it's 8/16/32-bit // handling stuff). Using these instructions is often far more efficient