Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1410,6 +1410,19 @@ return false; } + if (AM.Scale == 1) { + if (AM.Base_Reg == N) { + SDValue Base_Reg = AM.Base_Reg; + AM.Base_Reg = AM.IndexReg; + AM.IndexReg = Base_Reg; + AM.Scale++; + return false; + } else if (AM.IndexReg == N) { + AM.Scale++; + return false; + } + } + // Otherwise, we cannot select it. return true; } Index: lib/Target/X86/X86OptimizeLEAs.cpp =================================================================== --- lib/Target/X86/X86OptimizeLEAs.cpp +++ lib/Target/X86/X86OptimizeLEAs.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -65,8 +66,8 @@ public: MemOpKey(const MachineOperand *Base, const MachineOperand *Scale, const MachineOperand *Index, const MachineOperand *Segment, - const MachineOperand *Disp) - : Disp(Disp) { + const MachineOperand *Disp, bool DispCheck = false) + : Disp(Disp),HardDispCheck(DispCheck) { Operands[0] = Base; Operands[1] = Scale; Operands[2] = Index; @@ -83,7 +84,10 @@ // matters that they use the same symbol/index/address. Immediates' or // offsets' differences will be taken care of during instruction // substitution. - return isSimilarDispOp(*Disp, *Other.Disp); + if (HardDispCheck) + return isIdenticalOp(*Disp,*Other.Disp); + else + return isSimilarDispOp(*Disp, *Other.Disp); } // Address' base, scale, index and segment operands. @@ -91,6 +95,9 @@ // Address' displacement operand. const MachineOperand *Disp; + + // Forces absolute displacement check. + bool HardDispCheck; }; } // end anonymous namespace @@ -178,6 +185,17 @@ &MI.getOperand(N + X86::AddrDisp)); } +static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) { + static MachineOperand DummyScale = MachineOperand::CreateImm(1); + assert((isLEA(MI) || MI.mayLoadOrStore()) && + "The instruction must be a LEA, a load or a store"); + return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg), + &DummyScale, + &MI.getOperand(N + X86::AddrIndexReg), + &MI.getOperand(N + X86::AddrSegmentReg), + &MI.getOperand(N + X86::AddrDisp),true); +} + static inline bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2) { return MO1.isIdenticalTo(MO2) && @@ -228,6 +246,12 @@ /// been calculated by LEA. Also, remove redundant LEAs. bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + } + private: typedef DenseMap> MemOpMap; @@ -261,6 +285,12 @@ /// distance between them. void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs); + /// \brief Find all LEA instructions in the basic block that have same + /// Base, Index, Disp and Segment. + void populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs); + + bool cseLEAs(const MachineBasicBlock &MBB); + /// \brief Removes redundant address calculations. bool removeRedundantAddrCalc(MemOpMap &LEAs); @@ -275,6 +305,7 @@ DenseMap InstrPos; + MachineDominatorTree *DT; MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; @@ -286,6 +317,13 @@ FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } +void OptimizeLEAPass::populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs) { + for (auto &MI : MBB) { + if (isLEA(MI)) + LEAs[getMemOpCSEKey(MI, 1)].push_back(const_cast(&MI)); + } +} + int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, const MachineInstr &Last) { // Both instructions must be in the same basic block and they must be @@ -646,6 +684,60 @@ return Changed; } +bool OptimizeLEAPass::cseLEAs(const MachineBasicBlock &MBB) { + MemOpMap LEAs; + bool cseDone = false; + + populateCSEMap(MBB,LEAs); + + auto CompareFn = + [] (const MachineInstr *Arg1,const MachineInstr *Arg2) -> bool { + if(Arg1->getOperand(2).getImm() < Arg2->getOperand(2).getImm()) + return false; + return true; + }; + + // Loop over all entries in the table. + for (auto &E : LEAs) { + auto &List = E.second; + if(List.size() > 1) { + std::sort(List.begin(),List.end(),CompareFn); + } + // Loop over all LEA pairs. + for(auto LII = List.begin(); LII != List.end(); LII++) { + MachineInstr &LI1 = **LII; + auto LINext = std::next(LII); + if(LINext == List.end()) + break; + MachineInstr &LI2 = **LINext; + if (!DT->dominates(&LI2,&LI1)) + continue; + + int Scale1 = LI1.getOperand(2).getImm(); + int Scale2 = LI2.getOperand(2).getImm(); + assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg"); + DebugLoc DL = LI1.getDebugLoc(); + + if ((Scale1 - Scale2) > 0) { + MachineInstr *NewMI = + BuildMI(*(const_cast(&MBB)), + &LI1,DL,TII->get(LI1.getOpcode())) + .addDef(LI1.getOperand(0).getReg()) // Dst = Dst of LI1. + .addUse(LI2.getOperand(0).getReg()) // Base = Dst of LI2. + .addImm(Scale1-Scale2) // Scale = Diff b/w scales. + .addUse(LI1.getOperand(3).getReg()) // Index = Index of LI1. + .addImm(0) // Disp = 0 + .addUse(LI1.getOperand(5).getReg()); // Segment = Segmant of LI1. + + LI1.eraseFromParent(); + cseDone = NewMI != nullptr; + } + } + } + return cseDone; +} + + bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; @@ -655,12 +747,16 @@ MRI = &MF.getRegInfo(); TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); + DT = &getAnalysis(); // Process all basic blocks. for (auto &MBB : MF) { MemOpMap LEAs; InstrPos.clear(); + // Attempt CSE over LEAs. + Changed |= cseLEAs(MBB); + // Find all LEA instructions in basic block. findLEAs(MBB, LEAs); Index: test/CodeGen/X86/lea-opt-cst.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/lea-opt-cst.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86 +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.SA = type { i32 , i32 , i32 , i32 ,i32 } + +define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr { +; X86-LABEL: test_func: +; X86: # BB#0: +; X86-NEXT: movl (%rdi), %eax +; X86-NEXT: movl 16(%rdi), %ecx +; X86-NEXT: leal 1(%rax,%rcx), %eax +; X86-NEXT: movl %eax, 12(%rdi) +; X86-NEXT: leal (%eax,%rcx), %eax +; X86-NEXT: movl %eax, 16(%rdi) +; X86-NEXT: retq + entry: + %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0 + %0 = load i32, i32* %h0, align 8 + %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3 + %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4 + %1 = load i32, i32* %h4, align 8 + %add = add i32 %0, 1 + %add4 = add i32 %add, %1 + store i32 %add4, i32* %h3, align 4 + %add29 = add i32 %add4 , %1 + store i32 %add29, i32* %h4, align 8 + ret void +} Index: test/CodeGen/X86/mul-constant-i16.ll =================================================================== --- test/CodeGen/X86/mul-constant-i16.ll +++ test/CodeGen/X86/mul-constant-i16.ll @@ -558,11 +558,10 @@ define i16 @test_mul_by_29(i16 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; @@ -571,8 +570,7 @@ ; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal (%rdi,%rdi,8), %eax ; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: leal (%rax,%rdi,2), %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 29 Index: test/CodeGen/X86/mul-constant-i32.ll =================================================================== --- test/CodeGen/X86/mul-constant-i32.ll +++ test/CodeGen/X86/mul-constant-i32.ll @@ -1457,30 +1457,27 @@ define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: retl ; ; X64-HSW-LABEL: test_mul_by_29: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: retq # sched: [1:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: ; X64-JAG: # BB#0: ; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_29: ; X86-NOOPT: # BB#0: Index: test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- test/CodeGen/X86/mul-constant-i64.ll +++ test/CodeGen/X86/mul-constant-i64.ll @@ -1523,8 +1523,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ecx, %edx @@ -1532,19 +1531,17 @@ ; ; X64-HSW-LABEL: test_mul_by_29: ; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rax,%rdi,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: ; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] +; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rax,%rdi,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_29: ; X86-NOOPT: # BB#0: Index: test/CodeGen/X86/mul-constant-result.ll =================================================================== --- test/CodeGen/X86/mul-constant-result.ll +++ test/CodeGen/X86/mul-constant-result.ll @@ -163,8 +163,7 @@ ; X86-NEXT: .LBB0_35: ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; X86-NEXT: .LBB0_36: @@ -322,16 +321,17 @@ ; X64-HSW-NEXT: .LBB0_31: ; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx ; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx -; X64-HSW-NEXT: jmp .LBB0_17 -; X64-HSW-NEXT: .LBB0_32: -; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx -; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx -; X64-HSW-NEXT: addl %eax, %ecx ; X64-HSW-NEXT: .LBB0_17: ; X64-HSW-NEXT: addl %eax, %ecx ; X64-HSW-NEXT: movl %ecx, %eax ; X64-HSW-NEXT: # kill: %EAX %EAX %RAX ; X64-HSW-NEXT: retq +; X64-HSW-NEXT: .LBB0_32: +; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx +; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx +; X64-HSW-NEXT: leal (%rcx,%rax,2), %eax +; X64-HSW-NEXT: # kill: %EAX %EAX %RAX +; X64-HSW-NEXT: retq ; X64-HSW-NEXT: .LBB0_33: ; X64-HSW-NEXT: movl %eax, %ecx ; X64-HSW-NEXT: shll $5, %ecx Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll =================================================================== --- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -13,14 +13,14 @@ ; X64-NEXT: .p2align ; X64: %loop ; no complex address modes -; X64-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X64-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, ; ; X32: @simple ; no expensive address computation in the preheader ; X32-NOT: imul ; X32: %loop ; no complex address modes -; X32-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind { entry: br label %loop @@ -103,7 +103,7 @@ ; X32-NOT: mov{{.*}}(%esp){{$}} ; X32: %for.body{{$}} ; no complex address modes -; X32-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, ; no reloads ; X32-NOT: (%esp) define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind { Index: utils/TableGen/DAGISelMatcherGen.cpp =================================================================== --- utils/TableGen/DAGISelMatcherGen.cpp +++ utils/TableGen/DAGISelMatcherGen.cpp @@ -305,7 +305,7 @@ const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator()); // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is - // a constant without a predicate fn that has more that one bit set, handle + // a constant without a predicate fn that has more than one bit set, handle // this as a special case. This is usually for targets that have special // handling of certain large constants (e.g. alpha with it's 8/16/32-bit // handling stuff). Using these instructions is often far more efficient