Index: lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- lib/CodeGen/TwoAddressInstructionPass.cpp +++ lib/CodeGen/TwoAddressInstructionPass.cpp @@ -102,6 +102,8 @@ bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, MachineBasicBlock::iterator OldPos); + bool isRevCopyChain(unsigned FromReg, unsigned ToReg); + bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef); bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, @@ -309,6 +311,48 @@ return true; } +/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg +/// in current BB. +static MachineInstr* getSingleDef(unsigned Reg, MachineBasicBlock *BB, + const MachineRegisterInfo *MRI) { + MachineInstr *ret = NULL; + for (MachineOperand &MO : MRI->reg_operands(Reg)) { + MachineInstr *MI = MO.getParent(); + if (MI->getParent() != BB || MI->isDebugValue()) + continue; + if (MO.isDef()) { + if (!ret) + ret = MI; + else if (ret != MI) + return NULL; + } + } + return ret; +} + +/// Check if there is a reversed copy chain from FromReg to ToReg: +/// %Tmp1 = copy %Tmp2; +/// %FromReg = copy %Tmp1; +/// %ToReg = add %FromReg ... +/// %Tmp2 = copy %ToReg; +bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg) { + unsigned TmpReg = FromReg; + // We set Length to 3 for now, which covers the common cases we see. Set the + // Length limit is to avoid endless searching in a copy chain loop. + int Length = 3; + for (int i = 0; i < Length; i++) { + MachineInstr *Def = getSingleDef(TmpReg, MBB, MRI); + if (Def && Def->isCopy()) + TmpReg = Def->getOperand(1).getReg(); + else + return false; + + if (TmpReg == ToReg) + return true; + } + return false; +} + /// noUseAfterLastDef - Return true if there are no intervening uses between the /// last instruction in the MBB that defines the specified register and the /// two-address instruction which is being processed. It also returns the last @@ -574,6 +618,27 @@ if (!noUseAfterLastDef(regB, Dist, LastDefB)) return true; + // Look for situation like this: + // %reg101 = MOV %reg100 + // %reg102 = ... + // %reg103 = ADD %reg102, %reg101 + // ... = %reg103 ... + // %reg100 = MOV %reg103 + // If there is a reversed copy chain from reg101 to reg103, commute the ADD + // to eliminate an otherwise unavoidable copy. + // FIXME: + // We can extend the logic further: If an pair of operands in an insn has + // been merged, the insn could be regarded as a virtual copy, and the virtual + // copy could also be used to construct a copy chain. + // To more generally minimize register copies, ideally the logic of two addr + // instruction pass should be integrated with register allocation pass where + // interference graph is available. + if (isRevCopyChain(regC, regA)) + return true; + + if (isRevCopyChain(regB, regA)) + return false; + // Since there are no intervening uses for both registers, then commute // if the def of regC is closer. Its live interval is shorter. return LastDefB && LastDefC && LastDefC > LastDefB; Index: test/CodeGen/X86/twoaddr-coalesce-3.ll =================================================================== --- test/CodeGen/X86/twoaddr-coalesce-3.ll +++ test/CodeGen/X86/twoaddr-coalesce-3.ll @@ -0,0 +1,78 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s +; This test is to ensure TwoAddrInstruction pass choose the proper operands to merge and +; generate less mov insns. + +@M = common global i32 0, align 4 +@total = common global i32 0, align 4 +@g = common global i32 0, align 4 + +; Function Attrs: nounwind uwtable +define void @foo() { +entry: + %0 = load i32* @M, align 4 + %cmp3 = icmp sgt i32 %0, 0 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %total.promoted = load i32* @total, align 4 + br label %for.body + +; Check that only one mov will be generated in the kernel loop. +; CHECK-LABEL: foo: +; CHECK: .LBB0_2: +; CHECK: mov +; CHECK-NOT: mov +; CHECK: .LBB0_2 +for.body: ; preds = %for.body.lr.ph, %for.body + %add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ] + %i.04 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %div = sdiv i32 %i.04, 2 + %add = add nsw i32 %div, %add5 + %inc = add nuw nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, %0 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + store i32 %add, i32* @total, align 4 + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} + +; Function Attrs: nounwind uwtable +define void @goo() #0 { +entry: + %0 = load i32* @M, align 4 + %cmp3 = icmp sgt i32 %0, 0 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %total.promoted = load i32* @total, align 4 + br label %for.body + +; Check that only two mov will be generated in the kernel loop. +; CHECK-LABEL: goo: +; CHECK: .LBB1_2: +; CHECK: mov +; CHECK: mov +; CHECK-NOT: mov +; CHECK: .LBB1_2 +for.body: ; preds = %for.body.lr.ph, %for.body + %add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ] + %i.04 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %div = sdiv i32 %i.04, 2 + %add = add nsw i32 %div, %add5 + store volatile i32 %add, i32* @g, align 4 + %inc = add nuw nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, %0 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + store i32 %add, i32* @total, align 4 + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} +