Index: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
@@ -98,6 +98,10 @@
 DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
                   cl::desc("Disable advanced copy optimization"));
 
+static cl::opt<bool> DisableNAPhysCopyOpt(
+    "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false),
+    cl::desc("Disable non-allocatable physical register copy optimization"));
+
 // Limit the number of PHI instructions to process
 // in PeepholeOptimizer::getNextSource.
 static cl::opt<unsigned> RewritePHILimit(
@@ -111,6 +115,7 @@
 STATISTIC(NumSelects, "Number of selects optimized");
 STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
 STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
+STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
 
 namespace {
   class ValueTrackerResult;
@@ -162,12 +167,24 @@
                        DenseMap<unsigned, MachineInstr *> &ImmDefMIs);
 
     /// \brief If copy instruction \p MI is a virtual register copy, track it in
-    /// the set \p CopiedFromRegs and \p CopyMIs. If this virtual register was
+    /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
     /// previously seen as a copy, replace the uses of this copy with the
     /// previously seen copy's destination register.
     bool foldRedundantCopy(MachineInstr *MI,
-                           SmallSet<unsigned, 4> &CopiedFromRegs,
-                           DenseMap<unsigned, MachineInstr *> &CopyMIs);
+                           SmallSet<unsigned, 4> &CopySrcRegs,
+                           DenseMap<unsigned, MachineInstr *> &CopyMIs);
+
+    /// \brief Is the register \p Reg a non-allocatable physical register?
+    bool isNAPhysCopy(unsigned Reg);
+
+    /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical
+    /// register copy, track it in the \p NAPhysToVirtMIs map. If this
+    /// non-allocatable physical register was previously copied to a virtual
+    /// register and hasn't been clobbered, the virt->phys copy can be
+    /// deleted.
+    bool foldRedundantNAPhysCopy(
+        MachineInstr *MI,
+        DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs);
 
     bool isLoadFoldable(MachineInstr *MI,
                         SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
@@ -1332,7 +1349,7 @@
     if (ImmDefRegs.count(Reg) == 0)
       continue;
     DenseMap<unsigned, MachineInstr *>::iterator II = ImmDefMIs.find(Reg);
-    assert(II != ImmDefMIs.end());
+    assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
     if (TII->FoldImmediate(MI, II->second, Reg, MRI)) {
       ++NumImmFold;
       return true;
@@ -1356,10 +1373,10 @@
 //
 //   Should replace %vreg2 uses with %vreg1:sub1
 bool PeepholeOptimizer::foldRedundantCopy(
-    MachineInstr *MI,
-    SmallSet<unsigned, 4> &CopySrcRegs,
-    DenseMap<unsigned, MachineInstr *> &CopyMIs) {
-  assert(MI->isCopy());
+    MachineInstr *MI,
+    SmallSet<unsigned, 4> &CopySrcRegs,
+    DenseMap<unsigned, MachineInstr *> &CopyMIs) {
+  assert(MI->isCopy() && "expected a COPY machine instruction");
 
   unsigned SrcReg = MI->getOperand(1).getReg();
   if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
@@ -1400,6 +1417,59 @@
   return true;
 }
 
+bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) {
+  return TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         !MRI->isAllocatable(Reg);
+}
+
+bool PeepholeOptimizer::foldRedundantNAPhysCopy(
+    MachineInstr *MI, DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs) {
+  assert(MI->isCopy() && "expected a COPY machine instruction");
+
+  if (DisableNAPhysCopyOpt)
+    return false;
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) {
+    // %vreg = COPY %PHYSREG
+    // Avoid using a data structure which can track multiple live non-allocatable
+    // phys->virt copies since LLVM doesn't seem to do this.
+    NAPhysToVirtMIs.insert({SrcReg, MI});
+    return false;
+  }
+
+  if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
+    return false;
+
+  // %PHYSREG = COPY %vreg
+  auto PrevCopy = NAPhysToVirtMIs.find(DstReg);
+  if (PrevCopy == NAPhysToVirtMIs.end()) {
+    // We can't remove the copy: there was an intervening clobber of the
+    // non-allocatable physical register after the copy to virtual.
+    DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI
+                 << '\n');
+    return false;
+  }
+
+  unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg();
+  if (PrevDstReg == SrcReg) {
+    // Remove the virt->phys copy: we saw the virtual register definition, and
+    // the non-allocatable physical register's state hasn't changed since then.
+    DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n');
+    ++NumNAPhysCopies;
+    return true;
+  }
+
+  // Potential missed optimization opportunity: we saw a different virtual
+  // register get a copy of the non-allocatable physical register, and we only
+  // track one such copy. Avoid getting confused by this new non-allocatable
+  // physical register definition, and remove it from the tracked copies.
+  DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n');
+  NAPhysToVirtMIs.erase(PrevCopy);
+  return false;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipOptnoneFunction(*MF.getFunction()))
     return false;
@@ -1433,6 +1503,13 @@
   DenseMap<unsigned, MachineInstr *> ImmDefMIs;
   SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
 
+  // Track when a non-allocatable physical register is copied to a virtual
+  // register so that useless moves can be removed.
+  //
+  // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG`
+  // without any intervening re-definition of %PHYSREG.
+  DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs;
+
   // Set of virtual registers that are copied from.
   SmallSet<unsigned, 4> CopySrcRegs;
   DenseMap<unsigned, MachineInstr *> CopySrcMIs;
@@ -1453,10 +1530,51 @@
       if (MI->isLoadFoldBarrier())
         FoldAsLoadDefCandidates.clear();
 
-      if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() ||
-          MI->isKill() || MI->isInlineAsm() ||
-          MI->hasUnmodeledSideEffects())
+      if (MI->isPosition() || MI->isPHI())
+        continue;
+
+      if (!MI->isCopy()) {
+        for (const auto &Op : MI->operands()) {
+          // Visit all operands: definitions can be implicit or explicit.
+          if (Op.isReg()) {
+            unsigned Reg = Op.getReg();
+            if (Op.isDef() && isNAPhysCopy(Reg)) {
+              const auto &Def = NAPhysToVirtMIs.find(Reg);
+              if (Def != NAPhysToVirtMIs.end()) {
+                // A new definition of the non-allocatable physical register
+                // invalidates previous copies.
+                DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+                             << '\n');
+                NAPhysToVirtMIs.erase(Def);
+              }
+            }
+          } else if (Op.isRegMask()) {
+            const uint32_t *RegMask = Op.getRegMask();
+            for (auto &RegMI : NAPhysToVirtMIs) {
+              unsigned Def = RegMI.first;
+              if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
+                DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+                             << '\n');
+                NAPhysToVirtMIs.erase(Def);
+              }
+            }
+          }
+        }
+      }
+
+      if (MI->isImplicitDef() || MI->isKill())
+        continue;
+
+      if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) {
+        // Blow away all knowledge of non-allocatable physical registers since
+        // we don't know what's correct anymore.
+        //
+        // FIXME: handle explicit asm clobbers.
+        DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
+                     << '\n');
+        NAPhysToVirtMIs.clear();
         continue;
+      }
 
       if ((isUncoalescableCopy(*MI) &&
            optimizeUncoalescableCopy(MI, LocalMIs)) ||
@@ -1479,7 +1597,9 @@
         continue;
       }
 
-      if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) {
+      if (MI->isCopy() &&
+          (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) ||
+           foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) {
         LocalMIs.erase(MI);
         MI->eraseFromParent();
         Changed = true;
Index: llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -0,0 +1,190 @@
+; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+
+; The peephole optimizer can elide some physical register copies such as
+; EFLAGS. Make sure the flags are used directly, instead of needlessly using
+; lahf, when possible.
+
+@L = external global i32
+@M = external global i8
+declare i32 @bar(i64)
+
+; CHECK-LABEL: plus_one
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: incl L
+define i1 @plus_one() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, 1 ; N.B. will emit inc.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: plus_forty_two
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: addl $42,
+define i1 @plus_forty_two() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, 42 ; N.B. won't emit inc.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: minus_one
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: decl L
+define i1 @minus_one() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, -1 ; N.B. will emit dec.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: minus_forty_two
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: addl $-42,
+define i1 @minus_forty_two() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, -42 ; N.B. won't emit dec.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: test_intervening_call:
+; CHECK: cmpxchg
+; CHECK: seto %al
+; CHECK-NEXT: lahf
+; CHECK: call{{[lq]}} bar
+; CHECK: addb $127, %al
+; CHECK-NEXT: sahf
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+  ; cmpxchg sets EFLAGS, call clobbers it, then br uses EFLAGS.
+  %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
+  %v = extractvalue { i64, i1 } %cx, 0
+  %p = extractvalue { i64, i1 } %cx, 1
+  call i32 @bar(i64 %v)
+  br i1 %p, label %t, label %f
+
+t:
+  ret i64 42
+
+f:
+  ret i64 0
+}
+
+; CHECK-LABEL: test_two_live_flags:
+; CHECK: cmpxchg
+; CHECK-NEXT: seto %al
+; CHECK-NEXT: lahf
+; Save result of the first cmpxchg into D.
+; CHECK-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]]
+; CHECK: cmpxchg
+; CHECK-NEXT: sete %al
+; Save result of the second cmpxchg onto the stack.
+; CHECK-NEXT: push{{[lq]}} %[[AX]]
+; Restore result of the first cmpxchg from D, put it back in EFLAGS.
+; CHECK-NEXT: mov{{[lq]}} %[[D]], %[[AX]]
+; CHECK-NEXT: addb $127, %al
+; CHECK-NEXT: sahf
+; Restore result of the second cmpxchg from the stack.
+; CHECK-NEXT: pop{{[lq]}} %[[AX]]
+; Test from EFLAGS restored from first cmpxchg, jump if that fails.
+; CHECK-NEXT: jne
+; Fallthrough to test the second cmpxchg's result.
+; CHECK: testb %al, %al
+; CHECK-NEXT: je
+define i64 @test_two_live_flags(
+    i64* %foo0, i64 %bar0, i64 %baz0,
+    i64* %foo1, i64 %bar1, i64 %baz1) {
+  %cx0 = cmpxchg i64* %foo0, i64 %bar0, i64 %baz0 seq_cst seq_cst
+  %p0 = extractvalue { i64, i1 } %cx0, 1
+  %cx1 = cmpxchg i64* %foo1, i64 %bar1, i64 %baz1 seq_cst seq_cst
+  %p1 = extractvalue { i64, i1 } %cx1, 1
+  %flag = and i1 %p0, %p1
+  br i1 %flag, label %t, label %f
+
+t:
+  ret i64 42
+
+f:
+  ret i64 0
+}
+
+; CHECK-LABEL: asm_clobbering_flags:
+; CHECK: test
+; CHECK-NEXT: setg
+; CHECK-NEXT: #APP
+; CHECK-NEXT: bsfl
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl
+; CHECK-NEXT: ret
+define i1 @asm_clobbering_flags(i32* %mem) {
+  %val = load i32, i32* %mem, align 4
+  %cmp = icmp sgt i32 %val, 0
+  %res = tail call i32 asm "bsfl $1,$0", "=r,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %val)
+  store i32 %res, i32* %mem, align 4
+  ret i1 %cmp
+}
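
For readers skimming the patch, the new fold boils down to one piece of bookkeeping: while scanning a block, remember the last `%vreg = COPY %PHYSREG` seen for each non-allocatable physical register, and erase a later `%PHYSREG = COPY %vreg` when that physical register has not been redefined in between and the same virtual register is being copied back. The program below is a simplified, hypothetical sketch of that bookkeeping and is not LLVM code: plain structs and std::unordered_map stand in for MachineInstr and DenseMap, and the implicit-def, regmask, and inline-asm invalidation handled in runOnMachineFunction above is omitted.

// Simplified sketch of the NAPhysToVirtMIs bookkeeping (not LLVM code).
#include <iostream>
#include <unordered_map>
#include <vector>

// Minimal stand-in for a COPY instruction: Dst = COPY Src.
struct CopyInst {
  unsigned Dst;
  unsigned Src;
  bool Erased = false;
};

// Registers >= PhysBase model non-allocatable physical registers (e.g. EFLAGS).
constexpr unsigned PhysBase = 1000;
static bool isNAPhys(unsigned Reg) { return Reg >= PhysBase; }

// Scan a straight-line list of copies and erase redundant virt->phys copies,
// mirroring foldRedundantNAPhysCopy (clobber invalidation omitted in this toy).
static void foldRedundantNAPhysCopies(std::vector<CopyInst> &Insts) {
  std::unordered_map<unsigned, const CopyInst *> NAPhysToVirt;
  for (CopyInst &MI : Insts) {
    if (isNAPhys(MI.Src) && !isNAPhys(MI.Dst)) {
      // %vreg = COPY %PHYSREG: remember the copy.
      NAPhysToVirt[MI.Src] = &MI;
    } else if (!isNAPhys(MI.Src) && isNAPhys(MI.Dst)) {
      // %PHYSREG = COPY %vreg: erase it if the same virtual register still
      // holds the last known value of the physical register.
      auto It = NAPhysToVirt.find(MI.Dst);
      if (It == NAPhysToVirt.end())
        continue;               // no tracked copy; nothing to fold
      if (It->second->Dst == MI.Src)
        MI.Erased = true;       // redundant virt->phys copy
      else
        NAPhysToVirt.erase(It); // different vreg: stop tracking it
    }
  }
}

int main() {
  // %1 = COPY %EFLAGS ... %EFLAGS = COPY %1 -> the second copy is redundant.
  std::vector<CopyInst> Insts = {{/*Dst=*/1, /*Src=*/PhysBase},
                                 {/*Dst=*/PhysBase, /*Src=*/1}};
  foldRedundantNAPhysCopies(Insts);
  std::cout << "second copy erased: " << Insts[1].Erased << '\n'; // prints 1
}

When bisecting a codegen problem, the knobs added by the patch itself are handy: passing -disable-non-allocatable-phys-copy-opt to llc turns the fold off, and the NumNAPhysCopies statistic (printed with -stats) reports how many copies it removed.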