Index: llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -27,6 +27,7 @@
 #include "X86Subtarget.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -102,7 +103,7 @@
   MachineDominatorTree *MDT;
 
   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
-                                  MachineInstr &CopyDefI);
+                                  MachineBasicBlock::iterator CopyDefI);
 
   unsigned promoteCondToReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator TestPos,
@@ -356,9 +357,14 @@
     // Nothing to do for a degenerate empty function...
     return false;
 
+  // Collect the copies in RPO so that when there are chains where a copy is in
+  // turn copied again, we visit the first one first. This ensures we can find
+  // viable locations for testing the original EFLAGS that dominate all the
+  // uses across complex CFGs.
   SmallVector<MachineInstr *, 4> Copies;
-  for (MachineBasicBlock &MBB : MF)
-    for (MachineInstr &MI : MBB)
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (MachineBasicBlock *MBB : RPOT)
+    for (MachineInstr &MI : *MBB)
       if (MI.getOpcode() == TargetOpcode::COPY &&
           MI.getOperand(0).getReg() == X86::EFLAGS)
         Copies.push_back(&MI);
@@ -407,12 +413,99 @@
     if (DOp.isDead())
       continue;
 
-    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+    MachineBasicBlock *TestMBB = CopyDefI.getParent();
     auto TestPos = CopyDefI.getIterator();
     DebugLoc TestLoc = CopyDefI.getDebugLoc();
 
     LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
 
+    // Walk up across live-in EFLAGS to find where they were actually def'ed.
+    //
+    // This copy's def may just be part of a region of blocks covered by
+    // a single def of EFLAGS and we want to find the top of that region where
+    // possible.
+    //
+    // This is essentially a search for a *candidate* reaching definition
+    // location. We don't need to ever find the actual reaching definition here,
+    // but we want to walk up the dominator tree to find the highest point which
+    // would be viable for such a definition.
+    auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+                                MachineBasicBlock::iterator End) {
+      // Scan backwards as we expect these to be relatively short and often find
+      // a clobber near the end.
+      return llvm::any_of(
+          llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+            // Flag any instruction (other than the copy we are
+            // currently rewriting) that defs EFLAGS.
+            return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+    };
+    auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+                                    MachineBasicBlock *EndMBB) {
+      assert(MDT->dominates(BeginMBB, EndMBB) &&
+             "Only support paths down the dominator tree!");
+      SmallPtrSet<MachineBasicBlock *, 4> Visited;
+      SmallVector<MachineBasicBlock *, 4> Worklist;
+      // We terminate at the beginning. No need to scan it.
+      Visited.insert(BeginMBB);
+      Worklist.push_back(EndMBB);
+      do {
+        auto *MBB = Worklist.pop_back_val();
+        for (auto *PredMBB : MBB->predecessors()) {
+          if (!Visited.insert(PredMBB).second)
+            continue;
+          if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+            return true;
+          // Enqueue this block to walk its predecessors.
+          Worklist.push_back(PredMBB);
+        }
+      } while (!Worklist.empty());
+      // No clobber found along any path from begin to end.
+ return false; + }; + while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() && + !HasEFLAGSClobber(TestMBB->begin(), TestPos)) { + // Find the nearest common dominator of the predecessors, as + // that will be the best candidate to hoist into. + MachineBasicBlock *HoistMBB = + std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(), + *TestMBB->pred_begin(), + [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) { + return MDT->findNearestCommonDominator(LHS, RHS); + }); + + // Now we need to scan all predecessors that may be reached along paths to + // the hoist block. A clobber anywhere in any of these blocks the hoist. + // Note that this even handles loops because we require *no* clobbers. + if (HasEFLAGSClobberPath(HoistMBB, TestMBB)) + break; + + // We also need the terminators to not sneakily clobber flags. + if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(), + HoistMBB->instr_end())) + break; + + // We found a viable location, hoist our test position to it. + TestMBB = HoistMBB; + TestPos = TestMBB->getFirstTerminator()->getIterator(); + // Clear the debug location as it would just be confusing after hoisting. + TestLoc = DebugLoc(); + } + LLVM_DEBUG({ + auto DefIt = llvm::find_if( + llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)), + [&](MachineInstr &MI) { + return MI.findRegisterDefOperand(X86::EFLAGS); + }); + if (DefIt.base() != TestMBB->instr_begin()) { + dbgs() << " Using EFLAGS defined by: "; + DefIt->dump(); + } else { + dbgs() << " Using live-in flags for BB:\n"; + TestMBB->dump(); + } + }); + // While rewriting uses, we buffer jumps and rewrite them in a second pass // because doing so will perturb the CFG that we are walking to find the // uses in the first place. @@ -423,7 +516,7 @@ // very few of them and we expect to not revisit the same copy definition // many times. If either of those change sufficiently we could build a map // of these up front instead. - CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI); + CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos); // Collect the basic blocks we need to scan. Typically this will just be // a single basic block but we may have to scan multiple blocks if the @@ -431,7 +524,6 @@ SmallVector Blocks; SmallPtrSet VisitedBlocks; Blocks.push_back(&MBB); - VisitedBlocks.insert(&MBB); do { MachineBasicBlock &UseMBB = *Blocks.pop_back_val(); @@ -439,36 +531,32 @@ // Track when if/when we find a kill of the flags in this block. bool FlagsKilled = false; - // We currently don't do any PHI insertion and so we require that the - // test basic block dominates all of the use basic blocks. - // - // We could in theory do PHI insertion here if it becomes useful by just - // taking undef values in along every edge that we don't trace this - // EFLAGS copy along. This isn't as bad as fully general PHI insertion, - // but still seems like a great deal of complexity. - // - // Because it is theoretically possible that some earlier MI pass or - // other lowering transformation could induce this to happen, we do - // a hard check even in non-debug builds here. - if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) { - LLVM_DEBUG({ - dbgs() << "ERROR: Encountered use that is not dominated by our test " - "basic block! 
-                    "nodes to track the flag state across the CFG.\n\nTest "
-                    "block:\n";
-          TestMBB.dump();
-          dbgs() << "Use block:\n";
-          UseMBB.dump();
-        });
-        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
-                           "does not dominate all uses.");
-      }
-
-      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
-                                      : UseMBB.instr_begin(),
+      // In most cases, we walk from the beginning to the end of the block. But
+      // when the block is the same block as the copy is from, we will visit it
+      // twice. The first time we start from the copy and go to the end. The
+      // second time we start from the beginning and go to the copy. This lets
+      // us handle copies inside of cycles.
+      // FIXME: This loop is *super* confusing. This is at least in part
+      // a symptom of all of this routine needing to be refactored into
+      // documentable components. Once done, there may be a better way to write
+      // this loop.
+      for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+                          ? std::next(CopyI->getIterator())
+                          : UseMBB.instr_begin(),
                 MIE = UseMBB.instr_end();
            MII != MIE;) {
         MachineInstr &MI = *MII++;
+        // If we are in the original copy block and encounter either the copy
+        // def or the copy itself, break so that we don't re-process any part of
+        // the block or process the instructions in the range that was copied
+        // over.
+        if (&MI == CopyI || &MI == &CopyDefI) {
+          assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+                 "Should only encounter these on the second pass over the "
+                 "original block.");
+          break;
+        }
+
         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
         if (!FlagUse) {
           if (MI.findRegisterDefOperand(X86::EFLAGS)) {
@@ -512,10 +600,10 @@
 
         // Otherwise we can just rewrite in-place.
         if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
-          rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+          rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
         } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
                    X86::COND_INVALID) {
-          rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+          rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
         } else if (MI.getOpcode() == TargetOpcode::COPY) {
           rewriteCopy(MI, *FlagUse, CopyDefI);
         } else {
@@ -538,13 +626,13 @@
         case X86::SETB_C64r:
           // Use custom lowering for arithmetic that is merely extending the
           // carry flag. We model this as the SETB_C* pseudo instructions.
-          rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+          rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                                   CondRegs);
           break;
 
         default:
           // Generically handle remaining uses as arithmetic instructions.
-          rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+          rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                             CondRegs);
           break;
         }
@@ -564,8 +652,38 @@
       // and queue those up for processing.
       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
-            VisitedBlocks.insert(SuccMBB).second)
+            VisitedBlocks.insert(SuccMBB).second) {
+          // We currently don't do any PHI insertion and so we require that the
+          // test basic block dominates all of the use basic blocks. Further, we
+          // can't have a cycle from the test block back to itself as that would
+          // create a cycle requiring a PHI to break it.
+          //
+          // We could in theory do PHI insertion here if it becomes useful by
+          // just taking undef values in along every edge that we don't trace
+          // this EFLAGS copy along. This isn't as bad as fully general PHI
+          // insertion, but still seems like a great deal of complexity.
+          //
+          // Because it is theoretically possible that some earlier MI pass or
+          // other lowering transformation could induce this to happen, we do
+          // a hard check even in non-debug builds here.
+          if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+            LLVM_DEBUG({
+              dbgs()
+                  << "ERROR: Encountered use that is not dominated by our test "
+                     "basic block! Rewriting this would require inserting PHI "
+                     "nodes to track the flag state across the CFG.\n\nTest "
+                     "block:\n";
+              TestMBB->dump();
+              dbgs() << "Use block:\n";
+              SuccMBB->dump();
+            });
+            report_fatal_error(
+                "Cannot lower EFLAGS copy when original copy def "
+                "does not dominate all uses.");
+          }
+
           Blocks.push_back(SuccMBB);
+        }
     } while (!Blocks.empty());
 
     // Now rewrite the jumps that use the flags. These we handle specially
@@ -580,7 +698,7 @@
       else
         LastJmpMBB = JmpI->getParent();
 
-      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+      rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
     }
 
     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -604,14 +722,13 @@
 
 /// Collect any conditions that have already been set in registers so that we
 /// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
-                                             MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
   CondRegArray CondRegs = {};
 
   // Scan backwards across the range of instructions with live EFLAGS.
-  for (MachineInstr &MI : llvm::reverse(
-           llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+  for (MachineInstr &MI :
+       llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
    X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
    if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
        TRI->isVirtualRegister(MI.getOperand(0).getReg()))
Index: llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir
===================================================================
--- llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir
+++ llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir
@@ -84,6 +84,12 @@
     call void @foo()
     ret i64 0
   }
+
+  define i64 @test_mid_cycle_copies(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret i64 0
+  }
 ...
 ---
 name: test_branch
@@ -738,3 +744,195 @@
     RET 0, $rax
 
 ...
+---
+# This test case is designed to exercise a particularly challenging situation:
+# when the flags are copied and restored *inside* of a complex and cyclic CFG
+# whose blocks all have live-in flags. To correctly handle this case we have to
+# walk up the dominator tree and locate a viable reaching definition location,
+# checking for clobbers along any path. The CFG for this function looks like the
+# following diagram, control flowing out the bottom of blocks and in the top:
+#
+# bb.0
+# |  _________________________
+# |/                          \
+# bb.1                         |
+# |\_________                  |
+# |  __      \     ____        |
+# |/   \      |   /    \       |
+# bb.2  |     bb.4      |      |
+# | \__/     /    \     |      |
+# |         /      \    |      |
+# bb.3    bb.5     bb.6 |      |
+# |          \     /    |      |
+# |           \   /     |      |
+# |            bb.7     |      |
+# |  _________/    \____/      |
+# |/                           |
+# bb.8                         |
+# |\__________________________/
+# |
+# bb.9
+#
+# We set EFLAGS in bb.0, clobber them in bb.3, and copy them in bb.2 and bb.6.
+# Because of the cycles, this requires hoisting the `SETcc` instructions that
+# capture the flags for the bb.6 copy up into bb.1 and using them for the copy
+# in `bb.2` as well, despite the clobber in `bb.3`. The clobber in `bb.3` also
+# prevents hoisting the `SETcc`s up to `bb.0`.
+#
+# Throughout the test we use branch instructions that are totally bogus (as the
+# flags are obviously not changing!) but this is just to allow us to send
+# a small but complex CFG structure through the backend and force it to choose
+# plausible lowering decisions based on the core CFG presented, regardless of
+# the futility of the actual branches.
+name: test_mid_cycle_copies
+# CHECK-LABEL: name: test_mid_cycle_copies
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+  ; CHECK: bb.0:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.4
+    liveins: $eflags
+
+    ; Outer loop header, target for one set of hoisting.
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.4
+  ; CHECK: bb.1:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags
+  ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags
+  ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.2:
+    successors: %bb.2, %bb.3
+    liveins: $eflags
+
+    ; Inner loop with a local copy. We should eliminate this but can't hoist.
+    %2:gr64 = COPY $eflags
+    $eflags = COPY %2
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.3
+  ; CHECK: bb.2:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.3:
+    successors: %bb.8
+    liveins: $eflags
+
+    ; Use and then clobber $eflags. Then hop to the outer loop latch.
+    %3:gr64 = ADC64ri32 %0, 42, implicit-def dead $eflags, implicit $eflags
+  ; CHECK: bb.3:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[B_REG]], 255, implicit-def $eflags
+  ; CHECK-NEXT: %3:gr64 = ADC64ri32 %0, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    JMP_1 %bb.8
+
+  bb.4:
+    successors: %bb.5, %bb.6
+    liveins: $eflags
+
+    ; Another inner loop, this one with a diamond.
+    JE_1 %bb.5, implicit $eflags
+    JMP_1 %bb.6
+  ; CHECK: bb.4:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT: JNE_1 %bb.5, implicit killed $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.5:
+    successors: %bb.7
+    liveins: $eflags
+
+    ; Just use $eflags on this side of the diamond.
+    %4:gr64 = CMOVA64rr %0, %1, implicit $eflags
+  ; CHECK: bb.5:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    JMP_1 %bb.7
+
+  bb.6:
+    successors: %bb.7
+    liveins: $eflags
+
+    ; Use, copy, and then use $eflags again.
+    %5:gr64 = CMOVA64rr %0, %1, implicit $eflags
+  ; CHECK: bb.6:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+
+    %6:gr64 = COPY $eflags
+    $eflags = COPY %6:gr64
+
+    %7:gr64 = CMOVA64rr %0, %1, implicit $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT: %7:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+    JMP_1 %bb.7
+
+  bb.7:
+    successors: %bb.4, %bb.8
+    liveins: $eflags
+
+    ; Inner loop latch.
+    JE_1 %bb.4, implicit $eflags
+    JMP_1 %bb.8
+  ; CHECK: bb.7:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT: JNE_1 %bb.4, implicit killed $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.8:
+    successors: %bb.1, %bb.9
+
+    ; Outer loop latch. Note that we cannot have EFLAGS live-in here as that
+    ; would immediately require PHIs.
+    CMP64rr %0, %1, implicit-def $eflags
+    JE_1 %bb.1, implicit $eflags
+    JMP_1 %bb.9
+  ; CHECK: bb.8:
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+  ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+  ; CHECK-NEXT: JE_1 %bb.1, implicit $eflags
+  ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.9:
+    liveins: $eflags
+
+    ; And we're done.
+    %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+    $rax = COPY %8
+    RET 0, $rax
+  ; CHECK: bb.9:
+  ; CHECK-NOT: $eflags
+  ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+
+...
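
Illustrative sketch (not part of the patch): the hoisting walk added above can be read in isolation as a small graph search. The following self-contained C++ is an approximation of that search over a toy CFG; the Block struct and the helpers nearestCommonDom, clobberOnPath, and findHoistBlock are invented for illustration, and the clobber checks are reduced to whole-block booleans rather than the per-instruction and terminator scans the pass actually performs.

#include <set>
#include <vector>

struct Block {
  bool FlagsLiveIn = false;    // EFLAGS live on entry to the block.
  bool ClobbersFlags = false;  // Some instruction in the block defs EFLAGS.
  std::vector<Block *> Preds;  // CFG predecessors.
  Block *IDom = nullptr;       // Immediate dominator, assumed precomputed.
};

// Simplified stand-in for MachineDominatorTree::findNearestCommonDominator:
// collect one idom chain, then walk the other chain until it hits the set.
static Block *nearestCommonDom(Block *A, Block *B) {
  std::set<Block *> Chain;
  for (Block *X = A; X; X = X->IDom)
    Chain.insert(X);
  for (Block *X = B; X; X = X->IDom)
    if (Chain.count(X))
      return X;
  return nullptr;
}

// Analogue of HasEFLAGSClobberPath: walk predecessors up from End, stopping at
// Begin, and report whether any block in between can clobber the flags.
static bool clobberOnPath(Block *Begin, Block *End) {
  std::set<Block *> Visited = {Begin};
  std::vector<Block *> Worklist = {End};
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    for (Block *Pred : B->Preds) {
      if (!Visited.insert(Pred).second)
        continue;
      if (Pred->ClobbersFlags)
        return true;
      Worklist.push_back(Pred);
    }
  }
  return false;
}

// Analogue of the hoisting loop: starting from the block containing the EFLAGS
// copy, keep moving the candidate test location up to the nearest common
// dominator of all predecessors, while the flags remain live-in and no path
// from the candidate back down to the current block can clobber them.
static Block *findHoistBlock(Block *TestBB) {
  while (TestBB->FlagsLiveIn && !TestBB->Preds.empty() &&
         !TestBB->ClobbersFlags) {
    Block *Hoist = TestBB->Preds.front();
    for (Block *Pred : TestBB->Preds)
      Hoist = nearestCommonDom(Hoist, Pred);
    if (!Hoist || clobberOnPath(Hoist, TestBB))
      break;
    TestBB = Hoist;
  }
  return TestBB;
}

On the CFG in the new test, a walk starting at the copy in bb.6 would settle on bb.1: bb.4 and then bb.1 are accepted as candidates, but hoisting further toward bb.0 is rejected because the paths back into bb.1 through bb.3 and bb.8 contain EFLAGS clobbers, which matches the behavior the test header describes.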