Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -30,6 +30,7 @@
   PPCMCInstLower.cpp
   PPCMachineFunctionInfo.cpp
   PPCMIPeephole.cpp
+  PPCPostRAPeephole.cpp
   PPCRegisterInfo.cpp
   PPCQPXLoadSplat.cpp
   PPCSubtarget.cpp
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -42,6 +42,7 @@
   FunctionPass *createPPCVSXFMAMutatePass();
   FunctionPass *createPPCVSXSwapRemovalPass();
   FunctionPass *createPPCMIPeepholePass();
+  FunctionPass *createPPCPostRAPeepholePass();
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCBranchCoalescingPass();
   FunctionPass *createPPCQPXLoadSplatPass();
@@ -59,6 +60,7 @@
   void initializePPCBoolRetToIntPass(PassRegistry&);
   void initializePPCExpandISELPass(PassRegistry &);
   void initializePPCTLSDynamicCallPass(PassRegistry &);
+  void initializePPCPostRAPeepholePass(PassRegistry &);
   extern char &PPCVSXFMAMutateID;

   namespace PPCII {
Index: lib/Target/PowerPC/PPCPostRAPeephole.cpp
===================================================================
--- /dev/null
+++ lib/Target/PowerPC/PPCPostRAPeephole.cpp
@@ -0,0 +1,373 @@
+//===-------------- PPCPostRAPeephole.cpp - MI Peephole Cleanups ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass aims to eliminate redundancy related to physical registers
+// after register allocation.
+// So far, we eliminate the following two types of redundant register copies.
+//
+// 1) intra-BB redundant register copy
+//      li Y, 0               li X, 0
+//      mr X, Y      =>       (erase mr)
+//      ..                    ..
+//
+// 2) inter-BB partially redundant register copy
+//
+//          BB1--------                           BB1--------
+//          |   ..    |                           |   ..    |
+//          | mr Y, X |                           | (erase) |
+//          |   ..    |                           |   ..    |
+//    with  -----------                     with  -----------
+//   1 pred  /    |                        1 pred  /    |
+//  BB--------    |    BB---------       BB---------    |    BB---------
+//  |   ..   |    |    |   ..    |  =>   | mr Y, X |    |    |   ..    |
+//  |   ..   |    |    |   ..    |       |   ..    |    |    | mr X, Y |
+//  ----------    |    -----------       -----------    |    -----------
+//                |    /   with                         |    /   with
+//          BB2--------  1 succ                 BB2--------  1 succ
+//          |   ..    |                           |   ..    |
+//          | mr X, Y |                           | (erase) |
+//          |   ..    |                           |   ..    |
+//          -----------                           -----------
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-postra-peepholes"
+
+namespace {
+
+struct PPCPostRAPeephole : public MachineFunctionPass {
+
+  static char ID;
+  const PPCInstrInfo *TII;
+  MachineFunction *MF;
+
+  PPCPostRAPeephole() : MachineFunctionPass(ID) {
+    initializePPCPostRAPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  // Perform peepholes.
+  bool eliminateRedundantRegCopy(void);
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(*MF.getFunction()))
+      return false;
+    initialize(MF);
+    bool Simplified = false;
+
+    Simplified |= eliminateRedundantRegCopy();
+    return Simplified;
+  }
+};
+
+// Initialize class variables.
+void PPCPostRAPeephole::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  DEBUG(dbgs() << "*** PowerPC Post RA peephole pass ***\n\n");
+}
+
+// If MI is a register copy, this method returns true and sets the
+// source and destination operands in SrcOperand and DstOperand.
+static bool isRegCopy(MachineInstr &MI, MachineOperand *&SrcOperand,
+                      MachineOperand *&DstOperand) {
+  // Refer to PPCInstrInfo::copyPhysReg to find the opcodes used for copying
+  // the value in a physical register for each register class.
+  // Currently we do not optimize VSX registers (copied by xxlor),
+  // which may involve different register types, i.e. FPR and VRF.
+  unsigned Opc = MI.getOpcode();
+  if (Opc == PPC::FMR) {
+    DstOperand = &MI.getOperand(0);
+    SrcOperand = &MI.getOperand(1);
+    return true;
+  }
+  if ((Opc == PPC::OR8 || Opc == PPC::OR || Opc == PPC::VOR) &&
+      MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+    DstOperand = &MI.getOperand(0);
+    // We return the last operand as src to access the kill flag.
+    SrcOperand = &MI.getOperand(2);
+    return true;
+  }
+  return false;
+}
+
+bool PPCPostRAPeephole::eliminateRedundantRegCopy(void) {
+  const PPCSubtarget *PPCSubTarget = &MF->getSubtarget<PPCSubtarget>();
+  const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
+  bool Simplified = false;
+  for (MachineBasicBlock &MBB : *MF) {
+    bool CurrentInstrErased = false;
+    do {
+      CurrentInstrErased = false;
+      for (MachineInstr &CopyMI : MBB) {
+        if (CopyMI.isDebugValue())
+          continue;
+
+        MachineOperand *SrcMO = nullptr, *DstMO = nullptr;
+        if (isRegCopy(CopyMI, SrcMO, DstMO) && SrcMO->isKill()) {
+          unsigned DstReg = DstMO->getReg();
+          unsigned SrcReg = SrcMO->getReg();
+
+          MachineInstr *DefMI = nullptr;
+          bool Found = false;
+          SmallVector<MachineOperand *, 2> OperandsToRewrite1;
+
+          // We iterate instructions backward to find the def of the source
+          // of the register copy, or an instruction conflicting with the
+          // register copy (e.g. an instruction that uses the destination
+          // reg of the copy).
+          MachineBasicBlock::reverse_iterator MBBRI = CopyMI,
+                                              MBBRIE = MBB.rend();
+          for (MBBRI++; MBBRI != MBBRIE && !Found; MBBRI++) {
+            // We conservatively assume a function call may clobber all regs.
+            if (MBBRI->isCall()) {
+              Found = true;
+              break;
+            }
+
+            // If this instruction defines the source reg of the copy,
+            // this is the target for our optimization.
+            for (auto DefMO : MBBRI->defs())
+              if (TRI->isSuperOrSubRegisterEq(DefMO.getReg(), SrcReg)) {
+                if (!DefMO.isTied())
+                  DefMI = &(*MBBRI);
+                Found = true;
+                break;
+              }
+            if (Found)
+              break;
+
+            // If this instr conflicts with the reg copy, we give up here.
+            for (auto MO : MBBRI->operands())
+              if (MO.isReg() &&
+                  TRI->isSuperOrSubRegisterEq(MO.getReg(), DstReg)) {
+                Found = true;
+                break;
+              }
+
+            const MCInstrDesc &MCID = TII->get(MBBRI->getOpcode());
+            if (MCID.ImplicitDefs && !Found)
+              for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef;
+                   ImpDef++)
+                if (TRI->isSuperOrSubRegisterEq(*ImpDef, SrcReg) ||
+                    TRI->isSuperOrSubRegisterEq(*ImpDef, DstReg)) {
+                  Found = true;
+                  break;
+                }
+            if (Found)
+              break;
+
+            // If this instruction uses the source register of the register
+            // copy, we need to remember the operand so we can replace it
+            // with the destination register of the copy.
+            // For example:
+            //   li r4, 1              li r3, 1
+            //   cmplwi r4, 0    ->    cmplwi r3, 0
+            //   mr r3, r4             (erase mr)
+            for (unsigned I = MCID.getNumDefs(); I < MCID.getNumOperands();
+                 I++)
+              if (MBBRI->getOperand(I).isReg()) {
+                unsigned Operand = MBBRI->getOperand(I).getReg();
+                if (Operand == SrcReg)
+                  OperandsToRewrite1.push_back(&MBBRI->getOperand(I));
+                else if (TRI->isSuperOrSubRegisterEq(Operand, SrcReg)) {
+                  Found = true;
+                  break;
+                }
+              }
+          }
+
+          // The def for the source of the register copy was found in this BB
+          // and can be optimized within this block.
+          if (DefMI && DefMI->getOperand(0).getReg() == SrcReg) {
+            DEBUG(dbgs() << "Eliminating redundant register copy:\n");
+            DEBUG(DefMI->dump());
+            DEBUG(CopyMI.dump());
+            DefMI->getOperand(0).setReg(DstReg);
+            CopyMI.eraseFromParent();
+            for (auto &MO : OperandsToRewrite1)
+              MO->setReg(DstReg);
+            CurrentInstrErased = true;
+            break;
+          }
+
+          // If there is a conflicting instruction (e.g. one that uses the
+          // source reg of the copy) between the def and the reg copy,
+          // we cannot optimize.
+          if (Found)
+            continue;
+
+          // If no def and no conflicting instruction is found in this BB,
+          // we check the predecessors.
+          // So far, we optimize only if this BB has exactly two predecessors,
+          // so that we do not increase the total number of instructions.
+          if (MBB.pred_size() != 2) continue;
+
+          // From here, we handle redundancy among multiple BBs.
+          // We currently support the following CFGs.
+          //
+          //            Pred1MBB
+          //             /    |
+          //     (SideMBB)    |     Pred2MBB
+          //                  |    /
+          //            MBB (including CopyMI)
+          //
+          // - MBB has exactly two predecessors.
+          // - Pred1MBB doesn't have more than two successors.
+          // - SideMBB is optional.
+          // - SideMBB has only one predecessor (Pred1MBB).
+          // - Pred2MBB has only one successor (MBB).
+          // - SideMBB and Pred2MBB may be the same BB.
+          auto FirstPredBB = MBB.pred_begin();
+          MachineBasicBlock *Pred2MBB = nullptr, *SideMBB = nullptr;
+          for (auto &Pred1MBB : MBB.predecessors()) {
+            // If the BBs do not match the patterns we support above,
+            // we give up before iterating over the instructions.
+            Pred2MBB = (Pred1MBB == *FirstPredBB) ? *(FirstPredBB + 1)
+                                                  : *FirstPredBB;
+            if (Pred1MBB->succ_size() > 2 || Pred2MBB->succ_size() > 1)
+              continue;
+
+            if (Pred1MBB->succ_size() == 2) {
+              auto FirstSuccBB = Pred1MBB->succ_begin();
+              SideMBB = (&MBB == *FirstSuccBB) ? *(FirstSuccBB + 1)
+                                               : *FirstSuccBB;
+              if (SideMBB->pred_size() > 1)
+                continue;
+            }
+
+            MachineInstr *DefMI = nullptr;
+            bool Found = false;
+            SmallVector<MachineOperand *, 2> OperandsToRewrite2;
+            // As we did for the current MBB, we iterate instructions backward
+            // to find the def of the source of the register copy, starting
+            // from the end of the BB.
+            MachineBasicBlock::reverse_iterator MBBRI = Pred1MBB->rbegin(),
+                                                MBBRIE = Pred1MBB->rend();
+            for (; MBBRI != MBBRIE && !Found; MBBRI++) {
+              if (MBBRI->isCall()) {
+                Found = true;
+                break;
+              }
+
+              // If this instruction defines the source reg of the copy,
+              // this is the target for our optimization.
+              for (auto DefMO : MBBRI->defs())
+                if (TRI->isSuperOrSubRegisterEq(DefMO.getReg(), SrcReg)) {
+                  if (!DefMO.isTied())
+                    DefMI = &(*MBBRI);
+                  Found = true;
+                  break;
+                }
+              if (Found)
+                break;
+
+              // If this instruction conflicts with the reg copy, we give up.
+              for (auto DefMO : MBBRI->defs())
+                if (TRI->isSuperOrSubRegisterEq(DefMO.getReg(), DstReg)) {
+                  Found = true;
+                  break;
+                }
+
+              const MCInstrDesc &MCID = TII->get(MBBRI->getOpcode());
+              if (MCID.ImplicitDefs && !Found)
+                for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef;
+                     ImpDef++)
+                  if (TRI->isSuperOrSubRegisterEq(*ImpDef, SrcReg) ||
+                      TRI->isSuperOrSubRegisterEq(*ImpDef, DstReg)) {
+                    Found = true;
+                    break;
+                  }
+              if (Found)
+                break;
+
+              for (unsigned I = MCID.getNumDefs(); I < MCID.getNumOperands();
+                   I++)
+                if (MBBRI->getOperand(I).isReg()) {
+                  unsigned Operand = MBBRI->getOperand(I).getReg();
+                  if (Operand == SrcReg)
+                    OperandsToRewrite2.push_back(&MBBRI->getOperand(I));
+                  else if (TRI->isSuperOrSubRegisterEq(Operand, SrcReg)) {
+                    Found = true;
+                    break;
+                  }
+                }
+            }
+            // We keep going iff the def of the reg copy is another reg copy
+            // with reversed src and dst.
+            if (!(DefMI && isRegCopy(*DefMI, SrcMO, DstMO) &&
+                  DefMI->getOpcode() == CopyMI.getOpcode() &&
+                  SrcReg == DstMO->getReg() && DstReg == SrcMO->getReg()))
+              continue;
+
+            DEBUG(dbgs() << "Optimizing partially redundant reg copy pair:\n");
+            DEBUG(DefMI->dump());
+            DEBUG(CopyMI.dump());
+
+            // Here, we have found a partially redundant register copy pair.
+            // If we have a SideMBB, we move the first reg copy to the top of
+            // it. Otherwise, we can simply erase it.
+            if (SideMBB) {
+              SideMBB->splice(SideMBB->getFirstNonDebugInstr(), Pred1MBB,
+                              MachineBasicBlock::iterator(DefMI));
+              SideMBB->removeLiveIn(SrcReg);
+              SideMBB->addLiveIn(DstReg);
+            } else
+              DefMI->eraseFromParent();
+
+            // We move the second reg copy to Pred2MBB by inserting it before
+            // the first terminator instruction (or at the end if there is no
+            // terminator in the BB).
+            MachineBasicBlock::iterator To = Pred2MBB->getFirstTerminator();
+            if (To == Pred2MBB->end())
+              Pred2MBB->push_back(MBB.remove(&CopyMI));
+            else
+              Pred2MBB->splice(To, &MBB, MachineBasicBlock::iterator(CopyMI));
+            MBB.removeLiveIn(SrcReg);
+            MBB.addLiveIn(DstReg);
+
+            // We touch up the remembered operands if we have found any.
+            for (auto &MO : OperandsToRewrite1)
+              MO->setReg(DstReg);
+            for (auto &MO : OperandsToRewrite2)
+              MO->setReg(DstReg);
+
+            CurrentInstrErased = true;
+            break;
+          }
+
+          Simplified |= CurrentInstrErased;
+          // If the current instruction has been eliminated,
+          // we cannot continue the iteration, so restart it.
+          if (CurrentInstrErased)
+            break;
+        }
+      }
+    } while (CurrentInstrErased);
+  }
+  return Simplified;
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCPostRAPeephole, DEBUG_TYPE,
+                      "PowerPC Post RA Peephole Optimization", false, false)
+INITIALIZE_PASS_END(PPCPostRAPeephole, DEBUG_TYPE,
+                    "PowerPC Post RA Peephole Optimization", false, false)
+
+char PPCPostRAPeephole::ID = 0;
+FunctionPass*
+llvm::createPPCPostRAPeepholePass() { return new PPCPostRAPeephole(); }
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -68,6 +68,10 @@
 opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
                             cl::desc("Disable machine peepholes for PPC"));

+static cl::
+opt<bool> DisablePostRAPeephole("disable-ppc-postra-peephole", cl::Hidden,
+                                cl::desc("Disable post RA peepholes for PPC"));
+
 static cl::opt<bool>
 EnableGEPOpt("ppc-gep-opt", cl::Hidden,
              cl::desc("Enable optimizations on complex GEPs"),
@@ -98,6 +102,7 @@
   initializePPCBoolRetToIntPass(PR);
   initializePPCExpandISELPass(PR);
   initializePPCTLSDynamicCallPass(PR);
+  initializePPCPostRAPeepholePass(PR);
 }

 /// Return the datalayout string of a subtarget.
@@ -422,6 +427,10 @@

 void PPCPassConfig::addPreSched2() {
   if (getOptLevel() != CodeGenOpt::None) {
+    // Target-specific peephole optimization after register allocation.
+    if (!DisablePostRAPeephole)
+      addPass(createPPCPostRAPeepholePass());
+
     addPass(&IfConverterID);

     // This optimization must happen after anything that might do store-to-load
Index: test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll
===================================================================
--- test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll
+++ test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll
@@ -10,10 +10,8 @@
 lnext:
   %elementArray = load i32*, i32** %elementArrayPtr, align 8
 ; CHECK: lwz [[LDREG:[0-9]+]], 124(1) # 4-byte Folded Reload
-; CHECK: # implicit-def: %X[[TEMPREG:[0-9]+]]
   %element = load i32, i32* %elementArray, align 4
-; CHECK: mr [[TEMPREG]], [[LDREG]]
-; CHECK: clrldi 4, [[TEMPREG]], 32
+; CHECK: clrldi 4, [[LDREG]], 32
   %element.ext = zext i32 %element to i64
   %value.ext = zext i32 %value to i64
   call void @func(i8* %context, i64 %value.ext, i64 %element.ext)
Index: test/CodeGen/PowerPC/ppc64-byval-align.ll
===================================================================
--- test/CodeGen/PowerPC/ppc64-byval-align.ll
+++ test/CodeGen/PowerPC/ppc64-byval-align.ll
@@ -24,8 +24,7 @@
   ret void
 }
 ; CHECK-LABEL: @caller1
-; CHECK: mr [[REG:[0-9]+]], 3
-; CHECK: mr 7, [[REG]]
+; CHECK: mr 7, 3
 ; CHECK: bl test1

 define i64 @callee2(%struct.pad* byval nocapture readnone %x, i32 signext %y, %struct.test* byval align 16 nocapture readonly %z) {
Index: test/CodeGen/PowerPC/redundant_regcopy.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/redundant_regcopy.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
+
+define i8* @func1(i8* %a, i1 %b) {
+; CHECK-LABEL: @func1
+; CHECK: bc
+; CHECK: # BB#1
+; CHECK: mr 30, 3
+; CHECK: bl callee
+; CHECK: mr 3, 30
+; CHECK: .LBB0_2:
+; CHECK: blr
+entry:
+  br i1 %b, label %exit, label %foo
+
+foo:
+  call void @callee()
+  br label %exit
+
+exit:
+  ret i8* %a
+}
+
+declare void @callee()
Index: test/CodeGen/PowerPC/redundant_regcopy_1.mir
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/redundant_regcopy_1.mir
@@ -0,0 +1,48 @@
+# test for BB-local register copy elimination
+# RUN: llc -run-pass ppc-postra-peepholes -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+  target datalayout = "E-m:e-i64:64-n32:64"
+  target triple = "powerpc64le-unknown-linux-gnu"
+  define signext i32 @func(i32 signext %i) {
+  entry:
+    ret i32 %i
+  }
+
+...
+---
+name:            func
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%x3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    liveins: %x3

+    ; CHECK-LABEL: bb.0.entry:
+    ; CHECK: %x3 = ADDI8 killed %x3, 1
+    ; CHECK-NOT: OR8
+    %x4 = ADDI8 killed %x3, 1
+    %x3 = OR8 %x4, killed %x4
+
+...
Index: test/CodeGen/PowerPC/redundant_regcopy_2.mir
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/redundant_regcopy_2.mir
@@ -0,0 +1,75 @@
+# test for partially redundant register copy elimination across BBs
+# RUN: llc -run-pass ppc-postra-peepholes -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+  target datalayout = "E-m:e-i64:64-n32:64"
+  target triple = "powerpc64le-unknown-linux-gnu"
+  define i8* @func(i8* %a, i1 %b) {
+  entry:
+    br i1 %b, label %exit, label %foo
+
+  foo:                                              ; preds = %entry
+    br label %exit
+
+  exit:                                             ; preds = %foo, %entry
+    ret i8* %a
+  }
+
+...
+---
+name:            func
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%x3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    successors: %bb.2.exit(0x40000000), %bb.1.foo(0x40000000)
+    liveins: %x3
+
+    ; CHECK-LABEL: bb.0.entry:
+    ; CHECK-NOT: OR8
+    ; CHECK: %cr0 = CMPLDI %x3, 0
+    %x4 = OR8 %x3, killed %x3
+    %cr0 = CMPLDI %x4, 0
+    BCC 76, killed %cr0, %bb.2.exit
+    B %bb.1.foo
+
+  bb.1.foo:
+    liveins: %x4
+
+    ; CHECK-LABEL: bb.1.foo:
+    ; CHECK: %x4 = OR8 %x3, killed %x3
+    ; CHECK: %x4 = ADDI8 %x4, 123
+    ; CHECK: %x3 = OR8 %x4, killed %x4
+    %x4 = ADDI8 %x4, 123
+    B %bb.2.exit
+
+  bb.2.exit:
+    liveins: %x4
+    ; CHECK-LABEL: bb.2.exit:
+    ; CHECK-NOT: OR8
+    ; CHECK: BLR8
+    %x3 = OR8 %x4, killed %x4
+    BLR8 implicit %lr8, implicit %rm
+
+...