Index: lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- lib/CodeGen/RegisterCoalescer.cpp +++ lib/CodeGen/RegisterCoalescer.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -189,6 +190,9 @@ /// This returns true if an interval was modified. bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); + /// We found a copy which can be moved to its less frequent predecessor. + bool removePartialRedundency(const CoalescerPair &CP, MachineInstr *CopyMI); + /// If the source of a copy is defined by a /// trivial computation, replace the copy by rematerialize the definition. bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI, @@ -861,6 +865,156 @@ return true; } +/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a +/// predecessor of BB2, and if B is not redefined on the way from A = B +/// in BB2 to B = A in BB2, B = A in BB2 is partially redundent if the +/// execution goes through the path from BB0 to BB2. We may move B = A +/// to the predecessor without such reversed copy. +/// So we will transform the program from: +/// BB0: +/// A = B; BB1: +/// ... ... +/// / \ / +/// BB2: +/// ... +/// B = A; +/// +/// to: +/// +/// BB0: BB1: +/// A = B; ... +/// ... B = A; +/// / \ / +/// BB2: +/// ... +/// +/// A special case is when BB0 and BB2 are the same BB which is the only +/// BB in a loop: +/// BB1: +/// ... +/// BB0/BB2: ---- +/// B = A; | +/// ... | +/// A = B; | +/// |------- +/// | +/// We may hoist B = A from BB0/BB2 to BB1. +bool RegisterCoalescer::removePartialRedundency(const CoalescerPair &CP, + MachineInstr *CopyMI) { + assert(!CP.isPhys()); + if (!CopyMI->isFullCopy()) + return false; + + MachineBasicBlock *MBB = CopyMI->getParent(); + if (MBB->isEHPad()) + return false; + + if (MBB->pred_size() != 2) + return false; + + LiveInterval &IntA = + LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); + + // A is defined by PHI at the entry of MBB. + SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot(true); + VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx); + assert(AValNo && !AValNo->isUnused() && "COPY source not live"); + if (!AValNo->isPHIDef()) + return false; + + // No B is referenced before CopyMI in MBB. + if (IntB.overlaps(LIS->getMBBStartIdx(MBB), CopyIdx)) + return false; + + // MBB has two predecessors: one contains A = B so no copy will be inserted + // for it. The other one will have a copy moved from MBB. + bool FoundReverseCopy = false; + MachineBasicBlock *CopyLeftBB = nullptr; + for (auto Pred : MBB->predecessors()) { + VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred)); + MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def); + if (!DefMI || !DefMI->isFullCopy()) { + CopyLeftBB = Pred; + continue; + } + // Check DefMI is a reverse copy and it is in BB Pred. + if (DefMI->getOperand(0).getReg() != IntA.reg || + DefMI->getOperand(1).getReg() != IntB.reg || + DefMI->getParent() != Pred) { + CopyLeftBB = Pred; + continue; + } + // If there is any other def of B after DefMI and before the end of Pred, + // we need to keep the copy of B = A at the end of Pred if we remove + // B = A from MBB. + bool ValB_Changed = false; + for (auto VNI : IntB.valnos) { + if (VNI->isUnused()) + continue; + if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) { + ValB_Changed = true; + break; + } + } + if (ValB_Changed) { + CopyLeftBB = Pred; + continue; + } + FoundReverseCopy = true; + } + + // If no reverse copy is found in predecessors, nothing to do. + if (!FoundReverseCopy) + return false; + + // If CopyLeftBB is nullptr, it means every predecessor of MBB contains + // reverse copy, CopyMI can be removed trivially if only IntA/IntB is updated. + // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and + // update IntA/IntB. + // + // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so + // MBB is hoter than CopyLeftBB. + if (CopyLeftBB && CopyLeftBB->succ_size() > 1) + return false; + + // Now ok to move copy. + if (CopyLeftBB) + DEBUG(dbgs() << "\tremovePartialRedundency: Move the copy to BB#" + << CopyLeftBB->getNumber() << '\t' << *CopyMI); + else + DEBUG(dbgs() << "\tremovePartialRedundency: Remove the copy from BB#" + << MBB->getNumber() << '\t' << *CopyMI); + + if (CopyLeftBB) { + // Insert new copy to CopyLeftBB. + auto InsPos = CopyLeftBB->getFirstTerminator(); + MachineInstr *NewCopyMI = + BuildMI(*CopyLeftBB, InsPos, CopyMI->getDebugLoc(), + TII->get(TargetOpcode::COPY), IntB.reg) + .addReg(IntA.reg); + SlotIndex NewCopyIdx = + LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot(); + VNInfo *VNI = IntB.getNextValue(NewCopyIdx, LIS->getVNInfoAllocator()); + IntB.createDeadDef(VNI); + } + + // Remove CopyMI. + SlotIndex EndPoint = IntB.Query(CopyIdx.getRegSlot()).endPoint(); + LIS->removeVRegDefAt(IntB, CopyIdx.getRegSlot()); + LIS->RemoveMachineInstrFromMaps(*CopyMI); + CopyMI->eraseFromParent(); + + // Extend IntB to the EndPoint of its original live interval. + SmallVector EndPoints; + EndPoints.push_back(EndPoint); + LIS->extendToIndices(IntB, EndPoints); + + shrinkToUses(&IntA); + return true; +} + /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just /// defining a subregister. static bool definesFullReg(const MachineInstr &MI, unsigned Reg) { @@ -1486,6 +1640,12 @@ } } + // Try and see if we can partially eliminate the copy by moving the copy to + // its predecessor. + if (!CP.isPartial() && !CP.isPhys()) + if (removePartialRedundency(CP, CopyMI)) + return true; + // Otherwise, we are unable to join the intervals. DEBUG(dbgs() << "\tInterference!\n"); Again = true; // May be possible to coalesce later. Index: test/CodeGen/X86/pre-coalesce.ll =================================================================== --- test/CodeGen/X86/pre-coalesce.ll +++ test/CodeGen/X86/pre-coalesce.ll @@ -0,0 +1,51 @@ +; RUN: llc -regalloc=greedy -mtriple=x86_64-unknown-linux-gnu < %s -o - | FileCheck %s +; +; The test is to check no redundent mov as follows will be generated in %while.body loop. +; .LBB0_2: +; movsbl %cl, %ecx +; movl %edx, %eax ==> This movl can be promoted outside of loop. +; shll $5, %eax +; ... +; movl %eax, %edx +; jne .LBB0_2 +; +; CHECK-LABEL: foo: +; CHECK: [[L0:.LBB0_[0-9]+]]: # %while.body +; CHECK: movl %[[REGA:.*]], %[[REGB:.*]] +; CHECK-NOT: movl %[[REGB]], %[[REGA]] +; CHECK: jne [[L0]] +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@b = common local_unnamed_addr global i8* null, align 8 +@a = common local_unnamed_addr global i32 0, align 4 + +define i32 @foo() local_unnamed_addr { +entry: + %t0 = load i8*, i8** @b, align 8 + %t1 = load i8, i8* %t0, align 1 + %cmp4 = icmp eq i8 %t1, 0 + %t2 = load i32, i32* @a, align 4 + br i1 %cmp4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %t3 = phi i32 [ %add3, %while.body ], [ %t2, %while.body.preheader ] + %t4 = phi i8 [ %t5, %while.body ], [ %t1, %while.body.preheader ] + %conv = sext i8 %t4 to i32 + %add = mul i32 %t3, 33 + %add3 = add nsw i32 %add, %conv + store i32 %add3, i32* @a, align 4 + %t5 = load i8, i8* %t0, align 1 + %cmp = icmp eq i8 %t5, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %.lcssa = phi i32 [ %t2, %entry ], [ %add3, %while.end.loopexit ] + ret i32 %.lcssa +}