Index: lib/Target/PowerPC/CMakeLists.txt =================================================================== --- lib/Target/PowerPC/CMakeLists.txt +++ lib/Target/PowerPC/CMakeLists.txt @@ -42,6 +42,7 @@ PPCVSXFMAMutate.cpp PPCVSXSwapRemoval.cpp PPCExpandISEL.cpp + PPCRegCopyElim.cpp ) add_subdirectory(AsmParser) Index: lib/Target/PowerPC/PPC.h =================================================================== --- lib/Target/PowerPC/PPC.h +++ lib/Target/PowerPC/PPC.h @@ -49,6 +49,7 @@ FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); + FunctionPass *createPPCRegCopyElimPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, @@ -59,6 +60,7 @@ void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCTLSDynamicCallPass(PassRegistry &); + void initializePPCRegCopyElimPass(PassRegistry &); extern char &PPCVSXFMAMutateID; namespace PPCII { Index: lib/Target/PowerPC/PPCRegCopyElim.cpp =================================================================== --- /dev/null +++ lib/Target/PowerPC/PPCRegCopyElim.cpp @@ -0,0 +1,355 @@ +//===----- PPCRegCopyElim.cpp -- Redundant Register Copy Elimination -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass aims to eliminate redundancy related to physical registers +// after the register allocation. +// So far, we eliminate the following two types of redundant register copys. +// +// 1) intra-BB redundant register copy +// li Y, 0 li X, 0 +// mr X, Y => (erase mr) +// .. .. +// 2) inter-BB partially redundant register copy +// BB1-------- BB1-------- +// | .. | | .. | +// | mr Y, X | | (erase) | +// | .. | | .. | +// with ----------- with ----------- +// 1 pred / | 1 pred / | +// BB-------- | BB-------- BB--------- | BB--------- +// | .. | | | .. | => | mr Y, X | | | .. | +// | .. | | | .. | | .. | | | mr X, Y | +// ---------- | ---------- ----------- | ----------- +// | / with | / with +// BB2-------- 1 succ BB2-------- 1 succ +// | .. | | .. | +// | mr X, Y | | (erase) | +// | .. | | .. | +// ----------- ----------- +// +//===---------------------------------------------------------------------===// + +#include "PPCTargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-regcopy-elim" + +namespace { + +struct PPCRegCopyElim : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + + PPCRegCopyElim() : MachineFunctionPass(ID) { + initializePPCRegCopyElimPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + // Perform peepholes. + bool eliminateRedundantRegCopy(void); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(*MF.getFunction())) + return false; + initialize(MF); + bool Simplified = false; + + Simplified |= eliminateRedundantRegCopy(); + return Simplified; + } +}; + +// Initialize class variables. +void PPCRegCopyElim::initialize(MachineFunction &MFParm) { + MF = &MFParm; + TII = MF->getSubtarget().getInstrInfo(); + DEBUG(dbgs() << + "*** PowerPC Redundant Register Copy Elimination pass ***\n\n"); +} + +// If MI is a register copy, this method returns true and set +// source and destination operands in SrcOperand and DstOperand. +// When isKill is true, this method returns true only if the src operand +// has kill flag set. +static bool isRegCopy(MachineInstr &MI, MachineOperand* &SrcOperand, + MachineOperand* &DstOperand, bool isKill = false) { + // Refer PPCInstrInfo::copyPhysReg to find opcodes used for copying + // the value in a physical register for each register class. + // Currently we do not optimize VSX registers (copied by xxlor), + // which may involve different register types, i.e. FPR and VRF. + unsigned Opc = MI.getOpcode(); + if (Opc == PPC::FMR) { + if (isKill && !MI.getOperand(1).isKill()) + return false; + DstOperand = &MI.getOperand(0); + SrcOperand = &MI.getOperand(1); + return true; + } + if ((Opc == PPC::OR8 || Opc == PPC::OR || Opc == PPC::VOR) && + MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + if (isKill && !(MI.getOperand(1).isKill() || MI.getOperand(2).isKill())) + return false; + DstOperand = &MI.getOperand(0); + SrcOperand = &MI.getOperand(1); + return true; + } + return false; +} + +bool PPCRegCopyElim::eliminateRedundantRegCopy(void) { + const PPCSubtarget *PPCSubTarget = &MF->getSubtarget(); + const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo(); + bool Simplified = false; + for (MachineBasicBlock &MBB : *MF) { + bool CurrentInstrErased = false; + do { + CurrentInstrErased = false; + for (MachineInstr &CopyMI : MBB) { + if (CopyMI.isDebugValue()) + continue; + + MachineOperand *SrcMO = nullptr, *DstMO = nullptr; + if (isRegCopy(CopyMI, SrcMO, DstMO, true)) { + unsigned DstReg = DstMO->getReg(); + unsigned SrcReg = SrcMO->getReg(); + + MachineInstr *DefMI = nullptr; + bool Found = false; + SmallVector OperandsToRewrite1; + + // We iterate instructions backward to find the def of the source + // of register copy or instructions conflicting with register copy + // (e.g. instructions that use destination reg of copy). + MachineBasicBlock::reverse_iterator MBBRI = CopyMI, + MBBRIE = MBB.rend(); + for (MBBRI++; MBBRI != MBBRIE && !Found; MBBRI++) { + // We conservatively assume function call may crobber all regs. + if (MBBRI->isCall()) { + Found = true; + break; + } + + // If this instruction defines source reg of copy, this is the + // target for our optimization. + for (auto DefMO: MBBRI->defs()) + if (TRI->isSuperOrSubRegisterEq(DefMO.getReg(), SrcReg)) { + if (!DefMO.isTied() && !DefMO.isImplicit()) + DefMI = &(*MBBRI); + Found = true; + break; + } + if (Found) + break; + + // If this instr conflicts with the reg copy, we give up here. + for (auto MO: MBBRI->operands()) + if (MO.isReg() && + TRI->isSuperOrSubRegisterEq(MO.getReg(), DstReg)) { + Found = true; + break; + } + + // If this instruction uses source register of register copy, + // we need to remember the operand to replace it with destination + // register of copy. For example: + // li r4, 1 li r3, 1 + // cmplwi r4, 0 -> cmplwi r3, 0 + // mr r3, r4 (erase mr) + for (unsigned I = 0; I < MBBRI->getNumOperands(); I++) { + MachineOperand *MO = &MBBRI->getOperand(I); + if (!(MO->isReg() && MO->isUse())) + continue; + if (MO->getReg() == SrcReg) + OperandsToRewrite1.push_back(MO); + else if (TRI->isSuperOrSubRegisterEq(MO->getReg(), SrcReg)) { + Found = true; + break; + } + } + } + + // Def for the source of the register copy is found in the BB + // and can be optimized within this block. + if (DefMI && DefMI->getOperand(0).getReg() == SrcReg) { + DEBUG(dbgs() + << "Eliminating redundant register copy:\n"); + DEBUG(DefMI->dump()); + DEBUG(CopyMI.dump()); + DefMI->getOperand(0).setReg(DstReg); + CopyMI.eraseFromParent(); + for (auto &MO: OperandsToRewrite1) + MO->setReg(DstReg); + CurrentInstrErased = true; + break; + } + + // If there is conflicting instruction (e.g. uses source reg of copy) + // between def and reg copy, we cannot optimize. + if (Found) + continue; + + // If no def and no conflicting instruction is found in this BB, + // we check the predecessors. + // So far, we do not optimize if this BB has more than two + // predecessors not to increase the number of total instructions. + if (MBB.pred_size() != 2) continue; + + // From here, we handle redundancy among multiple BBs. + // We currently support following CFGs. + // + // Pred1MBB + // / | / + // (SideMBB) | Pred2MBB + // / | / + // MBB (including CopyMI) + // + // - MBB doesn't have more than two predecessors. + // - Pred1MBB doesn't have more than two successors. + // - SideMBB is optional. + // - SideMBB has only one predecessor (Pred1MBB). + // - Pred2MBB has only one successor (MBB). + // - SideMBB and Pred2MBB may be the same BB. + auto FirstPredBB = MBB.pred_begin(); + MachineBasicBlock *Pred2MBB = nullptr, *SideMBB = nullptr; + for (auto &Pred1MBB: MBB.predecessors()) { + // If the BBs does not match the above patterns we support, + // we give up before iterating instructions. + Pred2MBB = (Pred1MBB == *FirstPredBB) ? *(FirstPredBB + 1): + * FirstPredBB; + if (Pred1MBB->succ_size() > 2 || Pred2MBB->succ_size() > 1) + continue; + + if (Pred1MBB->succ_size() == 2) { + auto FirstSuccBB = Pred1MBB->succ_begin(); + SideMBB = (&MBB == *FirstSuccBB) ? *(FirstSuccBB + 1): + * FirstSuccBB; + if (SideMBB->pred_size() > 1) + continue; + } + + MachineInstr *DefMI = nullptr; + bool Found = false; + SmallVector OperandsToRewrite2; + // As we did for the current MBB, we iterate instructions backward + // to find the def of the source of register copy, starting from + // the end of the BB. + MachineBasicBlock::reverse_iterator MBBRI = Pred1MBB->rbegin(), + MBBRIE = Pred1MBB->rend(); + for (; MBBRI != MBBRIE && !Found; MBBRI++) { + if (MBBRI->isCall()) { + Found = true; + break; + } + + // If this instruction defines source reg of copy, this is the + // target for our optimization. + for (auto DefMO: MBBRI->defs()) + if (TRI->isSuperOrSubRegisterEq(DefMO.getReg(), SrcReg)) { + if (!DefMO.isTied() && !DefMO.isImplicit()) + DefMI = &(*MBBRI); + Found = true; + break; + } + if (Found) + break; + + // If this instruction conflicts with the reg copy, we give up. + for (auto DefMO: MBBRI->defs()) + if (TRI->isSuperOrSubRegisterEq(DefMO.getReg(), DstReg)) { + Found = true; + break; + } + + for (unsigned I = 0; I < MBBRI->getNumOperands(); I++) { + MachineOperand *MO = &MBBRI->getOperand(I); + if (!(MO->isReg() && MO->isUse())) + continue; + if (MO->getReg() == SrcReg) + OperandsToRewrite2.push_back(MO); + else if (TRI->isSuperOrSubRegisterEq(MO->getReg(), SrcReg)) { + Found = true; + break; + } + } + } + // We keep going iff the def of the reg copy is another reg copy + // with reversed src and dst. + if (!(DefMI && isRegCopy(*DefMI, SrcMO, DstMO) && + DefMI->getOpcode() == CopyMI.getOpcode() && + SrcReg == DstMO->getReg() && DstReg == SrcMO->getReg())) + continue; + + DEBUG(dbgs() + << "Optimizing partially redundant reg copy pair:\n"); + DEBUG(DefMI->dump()); + DEBUG(CopyMI.dump()); + + // Here, we find a partially redundant register copy pair. + // If we have SideMBB, we copy the first reg copy to the top of it. + // Otherwise, we can erase it. + if (SideMBB) { + SideMBB->splice(SideMBB->getFirstNonDebugInstr(), Pred1MBB, + MachineBasicBlock::iterator(DefMI)); + SideMBB->removeLiveIn(SrcReg); + SideMBB->addLiveIn(DstReg); + } + else + DefMI->eraseFromParent(); + + // We move the second reg copy to Pred2MBB by inserting before + // the first terminator instructions (or at the end if no + // terminator in the BB). + MachineBasicBlock::iterator To = Pred2MBB->getFirstTerminator(); + if (To == Pred2MBB->end()) + Pred2MBB->push_back(MBB.remove(&CopyMI)); + else + Pred2MBB->splice(To, &MBB, MachineBasicBlock::iterator(CopyMI)); + MBB.removeLiveIn(SrcReg); + MBB.addLiveIn(DstReg); + + // We touch up some operands if we have found any. + for (auto &MO: OperandsToRewrite1) + MO->setReg(DstReg); + for (auto &MO: OperandsToRewrite2) + MO->setReg(DstReg); + + CurrentInstrErased = true; + break; + } + + Simplified = CurrentInstrErased; + // If the current instruction has been eliminated, + // we cannot continue iteration, so restart it. + if (CurrentInstrErased) + break; + } + } + } while(CurrentInstrErased); + } + return Simplified; +} + +} // end default namespace + +INITIALIZE_PASS_BEGIN(PPCRegCopyElim, DEBUG_TYPE, + "PowerPC Redundant Register Copy Elimination", false, false) +INITIALIZE_PASS_END(PPCRegCopyElim, DEBUG_TYPE, + "PowerPC Redundant Register Copy Elimination", false, false) + +char PPCRegCopyElim::ID = 0; +FunctionPass* +llvm::createPPCRegCopyElimPass() { return new PPCRegCopyElim(); } Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -68,6 +68,10 @@ opt DisableMIPeephole("disable-ppc-peephole", cl::Hidden, cl::desc("Disable machine peepholes for PPC")); +static cl:: +opt DisableRegCopyElim("disable-ppc-regcopy-elim", cl::Hidden, + cl::desc("Disable register copy elimination for PPC")); + static cl::opt EnableGEPOpt("ppc-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), @@ -98,6 +102,7 @@ initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCTLSDynamicCallPass(PR); + initializePPCRegCopyElimPass(PR); } /// Return the datalayout string of a subtarget. @@ -422,6 +427,10 @@ void PPCPassConfig::addPreSched2() { if (getOptLevel() != CodeGenOpt::None) { + // We try to eliminate redundant register copies among physical registers. + if (!DisableRegCopyElim) + addPass(createPPCRegCopyElimPass()); + addPass(&IfConverterID); // This optimization must happen after anything that might do store-to-load Index: test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll =================================================================== --- test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll +++ test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll @@ -10,10 +10,8 @@ lnext: %elementArray = load i32*, i32** %elementArrayPtr, align 8 ; CHECK: lwz [[LDREG:[0-9]+]], 124(1) # 4-byte Folded Reload -; CHECK: # implicit-def: %X[[TEMPREG:[0-9]+]] %element = load i32, i32* %elementArray, align 4 -; CHECK: mr [[TEMPREG]], [[LDREG]] -; CHECK: clrldi 4, [[TEMPREG]], 32 +; CHECK: clrldi 4, [[LDREG]], 32 %element.ext = zext i32 %element to i64 %value.ext = zext i32 %value to i64 call void @func(i8* %context, i64 %value.ext, i64 %element.ext) Index: test/CodeGen/PowerPC/ppc64-byval-align.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-byval-align.ll +++ test/CodeGen/PowerPC/ppc64-byval-align.ll @@ -24,8 +24,7 @@ ret void } ; CHECK-LABEL: @caller1 -; CHECK: mr [[REG:[0-9]+]], 3 -; CHECK: mr 7, [[REG]] +; CHECK: mr 7, 3 ; CHECK: bl test1 define i64 @callee2(%struct.pad* byval nocapture readnone %x, i32 signext %y, %struct.test* byval align 16 nocapture readonly %z) { Index: test/CodeGen/PowerPC/redundant_regcopy.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/redundant_regcopy.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s + +define i8* @func1(i8* %a, i1 %b) { +; CHECK-LABEL: @func1 +; CHECK: bc +; CHECK: # BB#1 +; CHECK: mr 30, 3 +; CHECK: bl callee +; CHECK: mr 3, 30 +; CHECK: .LBB0_2: +; CHECK: blr +entry: + br i1 %b, label %exit, label %foo + +foo: + call void @callee() + br label %exit + +exit: + ret i8* %a +} + +declare void @callee() Index: test/CodeGen/PowerPC/redundant_regcopy_1.mir =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/redundant_regcopy_1.mir @@ -0,0 +1,48 @@ +# test for BB-local register copy elimination +# RUN: llc -run-pass ppc-regcopy-elim -verify-machineinstrs -o - %s | FileCheck %s + +--- | + target datalayout = "E-m:e-i64:64-n32:64" + target triple = "powerpc64le-unknown-linux-gnu" + define signext i32 @func(i32 signext %i) { + entry: + ret i32 %i + } + +... +--- +name: func +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%x3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %x3 + + ; CHECK-LABEL: bb.0.entry: + ; CHECK: %x3 = ADDI8 killed %x3, 1 + ; CHECK-NOT: OR8 + %x4 = ADDI8 killed %x3, 1 + %x3 = OR8 %x4, killed %x4 + +... + Index: test/CodeGen/PowerPC/redundant_regcopy_2.mir =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/redundant_regcopy_2.mir @@ -0,0 +1,75 @@ +# test for BB-local register copy elimination +# RUN: llc -run-pass ppc-regcopy-elim -verify-machineinstrs -o - %s | FileCheck %s + +--- | + target datalayout = "E-m:e-i64:64-n32:64" + target triple = "powerpc64le-unknown-linux-gnu" + define i8* @func(i8* %a, i1 %b) { + entry: + br i1 %b, label %exit, label %foo + + foo: ; preds = %entry + br label %exit + + exit: ; preds = %foo, %entry + ret i8* %a + } + +... +--- +name: func +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%x3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + successors: %bb.2.exit(0x40000000), %bb.1.foo(0x40000000) + liveins: %x3 + + ; CHECK-LABEL: bb.0.entry: + ; CHECK-NOT: OR8 + ; CHECK: %cr0 = CMPLDI %x3, 0 + %x4 = OR8 %x3, killed %x3 + %cr0 = CMPLDI %x4, 0 + BCC 76, killed %cr0, %bb.2.exit + B %bb.1.foo + + bb.1.foo: + liveins: %x4 + + ; CHECK-LABEL: bb.1.foo: + ; CHECK: %x4 = OR8 %x3, killed %x3 + ; CHECK: %x4 = ADDI8 %x4, 123 + ; CHECK: %x3 = OR8 %x4, killed %x4 + %x4 = ADDI8 %x4, 123 + B %bb.2.exit + + bb.2.exit: + liveins: %x4 + ; CHECK-LABEL: bb.2.exit: + ; CHECK-NOT: OR8 + ; CHECK: BLR8 + %x3 = OR8 %x4, killed %x4 + BLR8 implicit %lr8, implicit %rm + +... +