Index: llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -98,6 +99,7 @@
   const X86InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const TargetRegisterClass *PromoteRC;
+  MachineDominatorTree *MDT;
 
   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                   MachineInstr &CopyDefI);
@@ -145,6 +147,7 @@
 char X86FlagsCopyLoweringPass::ID = 0;
 
 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -342,6 +345,7 @@
   MRI = &MF.getRegInfo();
   TII = Subtarget.getInstrInfo();
   TRI = Subtarget.getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   PromoteRC = &X86::GR8RegClass;
 
   if (MF.begin() == MF.end())
@@ -416,103 +420,142 @@
     // of these up front instead.
     CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
 
-    for (auto MII = std::next(CopyI->getIterator()), MIE = MBB.instr_end();
-         MII != MIE;) {
-      MachineInstr &MI = *MII++;
-      MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
-      if (!FlagUse) {
-        if (MI.findRegisterDefOperand(X86::EFLAGS)) {
-          // If EFLAGS are defined, it's as-if they were killed. We can stop
-          // scanning here.
-          //
-          // NB!!! Many instructions only modify some flags. LLVM currently
-          // models this as clobbering all flags, but if that ever changes this
-          // will need to be carefully updated to handle that more complex
-          // logic.
+    // Collect the basic blocks we need to scan. Typically this will just be
+    // a single basic block but we may have to scan multiple blocks if the
+    // EFLAGS copy lives into successors.
+    SmallVector<MachineBasicBlock *, 2> Blocks;
+    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+    Blocks.push_back(&MBB);
+    VisitedBlocks.insert(&MBB);
+
+    do {
+      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
+
+      // We currently don't do any PHI insertion and so we require that the
+      // test basic block dominates all of the use basic blocks.
+      //
+      // We could in theory do PHI insertion here if it becomes useful by just
+      // taking undef values in along every edge that we don't trace this
+      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
+      // but still seems like a great deal of complexity.
+      //
+      // Because it is theoretically possible that some earlier MI pass or
+      // other lowering transformation could induce this to happen, we do
+      // a hard check even in non-debug builds here.
+      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
+        DEBUG({
+          dbgs() << "ERROR: Encountered use that is not dominated by our test "
+                    "basic block! Rewriting this would require inserting PHI "
+                    "nodes to track the flag state across the CFG.\n\nTest "
+                    "block:\n";
+          TestMBB.dump();
+          dbgs() << "Use block:\n";
+          UseMBB.dump();
+        });
+        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
                           "does not dominate all uses.");
+      }
+
+      for (auto MII = &UseMBB == &MBB ?
std::next(CopyI->getIterator()) + : UseMBB.instr_begin(), + MIE = UseMBB.instr_end(); + MII != MIE;) { + MachineInstr &MI = *MII++; + MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); + if (!FlagUse) { + if (MI.findRegisterDefOperand(X86::EFLAGS)) { + // If EFLAGS are defined, it's as-if they were killed. We can stop + // scanning here. + // + // NB!!! Many instructions only modify some flags. LLVM currently + // models this as clobbering all flags, but if that ever changes + // this will need to be carefully updated to handle that more + // complex logic. + FlagsKilled = true; + break; + } + continue; + } + + DEBUG(dbgs() << " Rewriting use: "; MI.dump()); + + // Check the kill flag before we rewrite as that may change it. + if (FlagUse->isKill()) FlagsKilled = true; + + // Once we encounter a branch, the rest of the instructions must also be + // branches. We can't rewrite in place here, so we handle them below. + // + // Note that we don't have to handle tail calls here, even conditional + // tail calls, as those are not introduced into the X86 MI until post-RA + // branch folding or black placement. As a consequence, we get to deal + // with the simpler formulation of conditional branches followed by tail + // calls. + if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { + auto JmpIt = MI.getIterator(); + do { + JmpIs.push_back(&*JmpIt); + ++JmpIt; + } while (JmpIt != UseMBB.instr_end() && + X86::getCondFromBranchOpc(JmpIt->getOpcode()) != + X86::COND_INVALID); break; } - continue; - } - DEBUG(dbgs() << " Rewriting use: "; MI.dump()); - - // Check the kill flag before we rewrite as that may change it. - if (FlagUse->isKill()) - FlagsKilled = true; + // Otherwise we can just rewrite in-place. + if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { + rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (X86::getCondFromSETOpc(MI.getOpcode()) != + X86::COND_INVALID) { + rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (MI.getOpcode() == TargetOpcode::COPY) { + rewriteCopy(MI, *FlagUse, CopyDefI); + } else { + // We assume that arithmetic instructions that use flags also def + // them. + assert(MI.findRegisterDefOperand(X86::EFLAGS) && + "Expected a def of EFLAGS for this instruction!"); + + // NB!!! Several arithmetic instructions only *partially* update + // flags. Theoretically, we could generate MI code sequences that + // would rely on this fact and observe different flags independently. + // But currently LLVM models all of these instructions as clobbering + // all the flags in an undef way. We rely on that to simplify the + // logic. + FlagsKilled = true; - // Once we encounter a branch, the rest of the instructions must also be - // branches. We can't rewrite in place here, so we handle them below. - // - // Note that we don't have to handle tail calls here, even conditional - // tail calls, as those are not introduced into the X86 MI until post-RA - // branch folding or black placement. As a consequence, we get to deal - // with the simpler formulation of conditional branches followed by tail - // calls. 
- if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { - auto JmpIt = MI.getIterator(); - do { - JmpIs.push_back(&*JmpIt); - ++JmpIt; - } while (JmpIt != MBB.instr_end() && - X86::getCondFromBranchOpc(JmpIt->getOpcode()) != - X86::COND_INVALID); - break; - } + rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + break; + } - // Otherwise we can just rewrite in-place. - if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { - rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) { - rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - } else if (MI.getOpcode() == TargetOpcode::COPY) { - rewriteCopy(MI, *FlagUse, CopyDefI); - } else { - // We assume that arithmetic instructions that use flags also def them. - assert(MI.findRegisterDefOperand(X86::EFLAGS) && - "Expected a def of EFLAGS for this instruction!"); - - // NB!!! Several arithmetic instructions only *partially* update - // flags. Theoretically, we could generate MI code sequences that - // would rely on this fact and observe different flags independently. - // But currently LLVM models all of these instructions as clobbering - // all the flags in an undef way. We rely on that to simplify the - // logic. - FlagsKilled = true; - - rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - break; + // If this was the last use of the flags, we're done. + if (FlagsKilled) + break; } - // If this was the last use of the flags, we're done. + // If the flags were killed, we're done with this block. if (FlagsKilled) break; - } - // If we didn't find a kill (or equivalent) check that the flags don't - // live-out of the basic block. Currently we don't support lowering copies - // of flags that live out in this fashion. - if (!FlagsKilled && - llvm::any_of(MBB.successors(), [](MachineBasicBlock *SuccMBB) { - return SuccMBB->isLiveIn(X86::EFLAGS); - })) { - DEBUG({ - dbgs() << "ERROR: Found a copied EFLAGS live-out from basic block:\n" - << "----\n"; - MBB.dump(); - dbgs() << "----\n" - << "ERROR: Cannot lower this EFLAGS copy!\n"; - }); - report_fatal_error( - "Cannot lower EFLAGS copy that lives out of a basic block!"); - } + // Otherwise we need to scan successors for ones where the flags live-in + // and queue those up for processing. + for (MachineBasicBlock *SuccMBB : UseMBB.successors()) + if (SuccMBB->isLiveIn(X86::EFLAGS) && + VisitedBlocks.insert(SuccMBB).second) + Blocks.push_back(SuccMBB); + } while (!Blocks.empty()); // Now rewrite the jumps that use the flags. These we handle specially - // because if there are multiple jumps we'll have to do surgery on the CFG. + // because if there are multiple jumps in a single basic block we'll have + // to do surgery on the CFG. + MachineBasicBlock *LastJmpMBB = nullptr; for (MachineInstr *JmpI : JmpIs) { - // Past the first jump we need to split the blocks apart. - if (JmpI != JmpIs.front()) + // Past the first jump within a basic block we need to split the blocks + // apart. 
+ if (JmpI->getParent() == LastJmpMBB) splitBlock(*JmpI->getParent(), *JmpI, *TII); + else + LastJmpMBB = JmpI->getParent(); rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs); } Index: llvm/test/CodeGen/X86/O0-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/O0-pipeline.ll +++ llvm/test/CodeGen/X86/O0-pipeline.ll @@ -37,6 +37,7 @@ ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Expand ISel Pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation +; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: Eliminate PHI nodes for register allocation Index: llvm/test/CodeGen/X86/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/O3-pipeline.ll +++ llvm/test/CodeGen/X86/O3-pipeline.ll @@ -90,6 +90,7 @@ ; CHECK-NEXT: X86 LEA Optimize ; CHECK-NEXT: X86 Optimize Call Frame ; CHECK-NEXT: X86 Avoid Store Forwarding Block +; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: Detect Dead Lanes Index: llvm/test/CodeGen/X86/copy-eflags.ll =================================================================== --- llvm/test/CodeGen/X86/copy-eflags.ll +++ llvm/test/CodeGen/X86/copy-eflags.ll @@ -196,3 +196,99 @@ tail call void @external_b() ret void } + +; Test a function that gets special select lowering into CFG with copied EFLAGS +; threaded across the CFG. This requires our EFLAGS copy rewriting to handle +; cross-block rewrites in at least some narrow cases. +; +; Note that the 'undef' values here are significant! Other values will in many +; cases not effectively trigger the lowering we're interested in exercising. 
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3) { +; X32-LABEL: PR37100: +; X32: # %bb.0: # %bb +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -16 +; X32-NEXT: .cfi_offset %edi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movb {{[0-9]+}}(%esp), %bl +; X32-NEXT: jmp .LBB3_1 +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB3_5: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: idivl %edi +; X32-NEXT: .LBB3_1: # %bb1 +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: movsbl %bl, %eax +; X32-NEXT: movl %eax, %edx +; X32-NEXT: sarl $31, %edx +; X32-NEXT: cmpl %eax, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: sbbl %edx, %eax +; X32-NEXT: setl %al +; X32-NEXT: setl %dl +; X32-NEXT: movzbl %dl, %edi +; X32-NEXT: negl %edi +; X32-NEXT: testb $-1, %al +; X32-NEXT: jne .LBB3_3 +; X32-NEXT: # %bb.2: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: # implicit-def: $bl +; X32-NEXT: .LBB3_3: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: testb $-1, %al +; X32-NEXT: jne .LBB3_5 +; X32-NEXT: # %bb.4: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: # implicit-def: $edi +; X32-NEXT: jmp .LBB3_5 +; +; X64-LABEL: PR37100: +; X64: # %bb.0: # %bb +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: jmp .LBB3_1 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB3_3: # %bb1 +; X64-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X64-NEXT: cmovll %eax, %esi +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivl %esi +; X64-NEXT: .LBB3_1: # %bb1 +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movsbq %dil, %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setl %al +; X64-NEXT: negl %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jl .LBB3_3 +; X64-NEXT: # %bb.2: # %bb1 +; X64-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X64-NEXT: # implicit-def: $dil +; X64-NEXT: jmp .LBB3_3 +bb: + br label %bb1 + +bb1: + %tmp = phi i8 [ %tmp8, %bb1 ], [ %arg1, %bb ] + %tmp2 = phi i16 [ %tmp11, %bb1 ], [ %arg2, %bb ] + %tmp3 = icmp sgt i16 %tmp2, 7 + %tmp4 = select i1 %tmp3, i16 %tmp2, i16 7 + %tmp5 = sext i8 %tmp to i64 + %tmp6 = icmp slt i64 %arg3, %tmp5 + %tmp7 = sext i1 %tmp6 to i32 + %tmp8 = select i1 %tmp6, i8 %tmp, i8 undef + %tmp9 = select i1 %tmp6, i32 %tmp7, i32 undef + %tmp10 = srem i32 0, %tmp9 + %tmp11 = trunc i32 %tmp10 to i16 + br label %bb1 +}
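
For readers who want the shape of the new cross-block handling without the surrounding pass machinery, here is a minimal standalone sketch of the worklist pattern the patch introduces. It is not LLVM code: Block, EFLAGSLiveIn, dominates, and scanBlockForFlagUses are invented stand-ins for MachineBasicBlock, MBB.isLiveIn(X86::EFLAGS), MachineDominatorTree::dominates, and the per-instruction rewrite loop, and the control flow is simplified relative to the pass itself (the real code calls report_fatal_error when dominance fails rather than returning).

// Standalone illustration only -- every type and helper below is a
// hypothetical stand-in, not an LLVM API.
#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int ID = 0;
  bool EFLAGSLiveIn = false;          // stand-in for MBB.isLiveIn(X86::EFLAGS)
  std::vector<Block *> Successors;
};

// Stand-in for MachineDominatorTree::dominates(&TestMBB, &UseMBB).
static bool dominates(const Block *A, const Block *B) {
  (void)A;
  (void)B;
  return true; // assume dominance holds for this sketch
}

// Stand-in for the per-block rewrite loop. Returns true if the flags are
// clobbered (killed) somewhere in the block, so nothing can live out of it.
static bool scanBlockForFlagUses(Block &B) {
  std::printf("rewriting EFLAGS uses in block %d\n", B.ID);
  return false;
}

// The worklist pattern used above: start at the block containing the EFLAGS
// copy and, whenever the flags survive to the end of a block, queue every
// successor that carries EFLAGS live-in, visiting each block at most once and
// insisting that the test block dominates every block we rewrite in.
static void scanFrom(Block &TestMBB, Block &CopyMBB) {
  std::vector<Block *> Worklist{&CopyMBB};
  std::set<Block *> Visited{&CopyMBB};
  do {
    Block &UseMBB = *Worklist.back();
    Worklist.pop_back();

    // No PHI insertion is done, so refuse non-dominated use blocks.
    if (&UseMBB != &TestMBB && !dominates(&TestMBB, &UseMBB)) {
      std::fprintf(stderr, "use block %d not dominated by test block\n",
                   UseMBB.ID);
      return;
    }

    if (scanBlockForFlagUses(UseMBB))
      continue; // flags clobbered here; they cannot be live into successors

    for (Block *Succ : UseMBB.Successors)
      if (Succ->EFLAGSLiveIn && Visited.insert(Succ).second)
        Worklist.push_back(Succ);
  } while (!Worklist.empty());
}

int main() {
  Block Entry{0}, Left{1}, Right{2};
  Left.EFLAGSLiveIn = Right.EFLAGSLiveIn = true;
  Entry.Successors = {&Left, &Right};
  scanFrom(Entry, Entry); // test block and copy block coincide here
  return 0;
}

Because the test block is required to dominate every block that gets rewritten, the condition values materialized next to the test are available at each rewritten use, which is what lets the pass keep skipping PHI insertion even though the scan now crosses block boundaries.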