Skip to content

Commit

Permalink
[MachineCopyPropagation] Extend pass to do COPY source forwarding
Browse files Browse the repository at this point in the history
Summary:
This change extends MachineCopyPropagation to do COPY source forwarding
and adds an additional run of the pass to the default pass pipeline just
after register allocation.

This version of this patch uses the newly added
MachineOperand::isRenamable bit to avoid forwarding registers is such a
way as to violate constraints that aren't captured in the
Machine IR (e.g. ABI or ISA constraints).

This change is a continuation of the work started in D30751.

Reviewers: qcolombet, javed.absar, MatzeB, jonpa, tstellar

Subscribers: tpr, mgorny, mcrosier, nhaehnle, nemanjai, jyknight, hfinkel, arsenm, inouehrs, eraman, sdardis, guyblank, fedor.sergeev, aheejin, dschuff, jfb, myatsina, llvm-commits

Differential Revision: https://reviews.llvm.org/D41835

llvm-svn: 323991
  • Loading branch information
geoffberry committed Feb 1, 2018
1 parent a95bd9f commit 94503c7
Showing 119 changed files with 795 additions and 495 deletions.
207 changes: 206 additions & 1 deletion llvm/lib/CodeGen/MachineCopyPropagation.cpp
Original file line number Diff line number Diff line change
@@ -9,6 +9,35 @@
//
// This is an extremely simple MachineInstr-level copy propagation pass.
//
// This pass forwards the source of COPYs to the users of their destinations
// when doing so is legal. For example:
//
// %reg1 = COPY %reg0
// ...
// ... = OP %reg1
//
// If
// - %reg0 has not been clobbered by the time of the use of %reg1
// - the register class constraints are satisfied
// - the COPY def is the only value that reaches OP
// then this pass replaces the above with:
//
// %reg1 = COPY %reg0
// ...
// ... = OP %reg0
//
// This pass also removes some redundant COPYs. For example:
//
// %R1 = COPY %R0
// ... // No clobber of %R1
// %R0 = COPY %R1 <<< Removed
//
// or
//
// %R1 = COPY %R0
// ... // No clobber of %R0
// %R1 = COPY %R0 <<< Removed
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
@@ -23,11 +52,13 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
@@ -37,6 +68,9 @@ using namespace llvm;
#define DEBUG_TYPE "machine-cp"

STATISTIC(NumDeletes, "Number of dead copies deleted");
STATISTIC(NumCopyForwards, "Number of copy uses forwarded");
DEBUG_COUNTER(FwdCounter, "machine-cp-fwd",
"Controls which register COPYs are forwarded");

namespace {

@@ -73,6 +107,10 @@ using Reg2MIMap = DenseMap<unsigned, MachineInstr *>;
void ReadRegister(unsigned Reg);
void CopyPropagateBlock(MachineBasicBlock &MBB);
bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
void forwardUses(MachineInstr &MI);
bool isForwardableRegClassCopy(const MachineInstr &Copy,
const MachineInstr &UseI, unsigned UseIdx);
bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);

/// Candidates for deletion.
SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;
@@ -208,6 +246,152 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
return true;
}

/// Decide whether we should forward the source of \param Copy to its use in
/// \param UseI based on the physical register class constraints of the opcode
/// and avoiding introducing more cross-class COPYs.
bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy,
const MachineInstr &UseI,
unsigned UseIdx) {

unsigned CopySrcReg = Copy.getOperand(1).getReg();

// If the new register meets the opcode register constraints, then allow
// forwarding.
if (const TargetRegisterClass *URC =
UseI.getRegClassConstraint(UseIdx, TII, TRI))
return URC->contains(CopySrcReg);

if (!UseI.isCopy())
return false;

/// COPYs don't have register class constraints, so if the user instruction
/// is a COPY, we just try to avoid introducing additional cross-class
/// COPYs. For example:
///
/// RegClassA = COPY RegClassB // Copy parameter
/// ...
/// RegClassB = COPY RegClassA // UseI parameter
///
/// which after forwarding becomes
///
/// RegClassA = COPY RegClassB
/// ...
/// RegClassB = COPY RegClassB
///
/// so we have reduced the number of cross-class COPYs and potentially
/// introduced a nop COPY that can be removed.
const TargetRegisterClass *UseDstRC =
TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg());

const TargetRegisterClass *SuperRC = UseDstRC;
for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses();
SuperRC; SuperRC = *SuperRCI++)
if (SuperRC->contains(CopySrcReg))
return true;

return false;
}

/// Check that \p MI does not have implicit uses that overlap with it's \p Use
/// operand (the register being replaced), since these can sometimes be
/// implicitly tied to other operands. For example, on AMDGPU:
///
/// V_MOVRELS_B32_e32 %VGPR2, %M0<imp-use>, %EXEC<imp-use>, %VGPR2_VGPR3_VGPR4_VGPR5<imp-use>
///
/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no
/// way of knowing we need to update the latter when updating the former.
bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI,
const MachineOperand &Use) {
for (const MachineOperand &MIUse : MI.uses())
if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() &&
MIUse.isUse() && TRI->regsOverlap(Use.getReg(), MIUse.getReg()))
return true;

return false;
}

/// Look for available copies whose destination register is used by \p MI and
/// replace the use in \p MI with the copy's source register.
void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
if (AvailCopyMap.empty())
return;

// Look for non-tied explicit vreg uses that have an active COPY
// instruction that defines the physical register allocated to them.
// Replace the vreg with the source of the active COPY.
for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx < OpEnd;
++OpIdx) {
MachineOperand &MOUse = MI.getOperand(OpIdx);
// Don't forward into undef use operands since doing so can cause problems
// with the machine verifier, since it doesn't treat undef reads as reads,
// so we can end up with a live range that ends on an undef read, leading to
// an error that the live range doesn't end on a read of the live range
// register.
if (!MOUse.isReg() || MOUse.isTied() || MOUse.isUndef() || MOUse.isDef() ||
MOUse.isImplicit())
continue;

if (!MOUse.getReg())
continue;

// Check that the register is marked 'renamable' so we know it is safe to
// rename it without violating any constraints that aren't expressed in the
// IR (e.g. ABI or opcode requirements).
if (!MOUse.isRenamable())
continue;

auto CI = AvailCopyMap.find(MOUse.getReg());
if (CI == AvailCopyMap.end())
continue;

MachineInstr &Copy = *CI->second;
unsigned CopyDstReg = Copy.getOperand(0).getReg();
const MachineOperand &CopySrc = Copy.getOperand(1);
unsigned CopySrcReg = CopySrc.getReg();

// FIXME: Don't handle partial uses of wider COPYs yet.
if (MOUse.getReg() != CopyDstReg) {
DEBUG(dbgs() << "MCP: FIXME! Not forwarding COPY to sub-register use:\n "
<< MI);
continue;
}

// Don't forward COPYs of reserved regs unless they are constant.
if (MRI->isReserved(CopySrcReg) && !MRI->isConstantPhysReg(CopySrcReg))
continue;

if (!isForwardableRegClassCopy(Copy, MI, OpIdx))
continue;

if (hasImplicitOverlap(MI, MOUse))
continue;

if (!DebugCounter::shouldExecute(FwdCounter)) {
DEBUG(dbgs() << "MCP: Skipping forwarding due to debug counter:\n "
<< MI);
continue;
}

DEBUG(dbgs() << "MCP: Replacing " << printReg(MOUse.getReg(), TRI)
<< "\n with " << printReg(CopySrcReg, TRI) << "\n in "
<< MI << " from " << Copy);

MOUse.setReg(CopySrcReg);
if (!CopySrc.isRenamable())
MOUse.setIsRenamable(false);

DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");

// Clear kill markers that may have been invalidated.
for (MachineInstr &KMI :
make_range(Copy.getIterator(), std::next(MI.getIterator())))
KMI.clearRegisterKills(CopySrcReg, TRI);

++NumCopyForwards;
Changed = true;
}
}

void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");

@@ -241,6 +425,11 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
continue;

forwardUses(*MI);

// Src may have been changed by forwardUses()
Src = MI->getOperand(1).getReg();

// If Src is defined by a previous copy, the previous copy cannot be
// eliminated.
ReadRegister(Src);
@@ -292,6 +481,20 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
continue;
}

// Clobber any earlyclobber regs first.
for (const MachineOperand &MO : MI->operands())
if (MO.isReg() && MO.isEarlyClobber()) {
unsigned Reg = MO.getReg();
// If we have a tied earlyclobber, that means it is also read by this
// instruction, so we need to make sure we don't remove it as dead
// later.
if (MO.isTied())
ReadRegister(Reg);
ClobberRegister(Reg);
}

forwardUses(*MI);

// Not a copy.
SmallVector<unsigned, 2> Defs;
const MachineOperand *RegMask = nullptr;
@@ -307,7 +510,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
"MachineCopyPropagation should be run after register allocation!");

if (MO.isDef()) {
if (MO.isDef() && !MO.isEarlyClobber()) {
Defs.push_back(Reg);
continue;
} else if (MO.readsReg())
@@ -364,6 +567,8 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// since we don't want to trust live-in lists.
if (MBB.succ_empty()) {
for (MachineInstr *MaybeDead : MaybeDeadCopies) {
DEBUG(dbgs() << "MCP: Removing copy due to no live-out succ: ";
MaybeDead->dump());
assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
MaybeDead->eraseFromParent();
Changed = true;
4 changes: 4 additions & 0 deletions llvm/lib/CodeGen/TargetPassConfig.cpp
Original file line number Diff line number Diff line change
@@ -1081,6 +1081,10 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// kill markers.
addPass(&StackSlotColoringID);

// Copy propagate to forward register uses and try to eliminate COPYs that
// were not coalesced.
addPass(&MachineCopyPropagationID);

// Run post-ra machine LICM to hoist reloads / remats.
//
// FIXME: can this move into MachineLateOptimization?
9 changes: 6 additions & 3 deletions llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
Original file line number Diff line number Diff line change
@@ -9,7 +9,8 @@ define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: halfword:
; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1]
; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1]
; CHECK: mov [[REG3:x[0-9]+]], [[REG2]]
; CHECK: strh [[REG1]], [{{.*}}[[REG3]], [[REG]], lsl #1]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -24,7 +25,8 @@ define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: word:
; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2]
; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2]
; CHECK: mov [[REG3:x[0-9]+]], [[REG2]]
; CHECK: str [[REG1]], [{{.*}}[[REG3]], [[REG]], lsl #2]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -39,7 +41,8 @@ define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: doubleword:
; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3]
; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3]
; CHECK: mov [[REG3:x[0-9]+]], [[REG2]]
; CHECK: str [[REG1]], [{{.*}}[[REG3]], [[REG]], lsl #3]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
16 changes: 4 additions & 12 deletions llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
Original file line number Diff line number Diff line change
@@ -8,27 +8,19 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
; Without advanced copy optimization, we end up with cross register
; banks copies that cannot be coalesced.
; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
; With advanced copy optimization, we end up with just one copy
; to insert the computed high part into the V register.
; CHECK-OPT-NOT: fmov
; CHECK-NOT: fmov
; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
; CHECK-OPT-NOT: fmov
; CHECK-NOT: fmov
; CHECK: mov.d v0[1], [[COPY_REG2]]
; CHECK-NEXT: ret
;
; GENERIC-LABEL: bar:
; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
; GENERIC-OPT-NOT: fmov
; GENERIC-NOT: fmov
; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
; GENERIC-OPT-NOT: fmov
; GENERIC-NOT: fmov
; GENERIC: mov v0.d[1], [[COPY_REG2]]
; GENERIC-NEXT: ret
%add = add <2 x i64> %a, %b
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
Original file line number Diff line number Diff line change
@@ -4,8 +4,10 @@
define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
entry:
; CHECK-LABEL: t:
; CHECK: mov x0, [[REG1:x[0-9]+]]
; CHECK: mov x1, [[REG2:x[0-9]+]]
; CHECK: mov [[REG2:x[0-9]+]], x3
; CHECK: mov [[REG1:x[0-9]+]], x2
; CHECK: mov x0, x2
; CHECK: mov x1, x3
; CHECK: bl _foo
; CHECK: mov x0, [[REG1]]
; CHECK: mov x1, [[REG2]]
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
Original file line number Diff line number Diff line change
@@ -45,8 +45,7 @@ define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {

; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
; CHECK: mov [[TMP:w[0-9]+]], wzr
; CHECK: eor w0, [[TMP]], #0x1
; CHECK: eor w0, wzr, #0x1
; CHECK: ret

%pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
Loading

0 comments on commit 94503c7

Please sign in to comment.