Index: llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -27,6 +27,7 @@
 #include "X86Subtarget.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -102,7 +103,7 @@
   MachineDominatorTree *MDT;
 
   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
-                                  MachineInstr &CopyDefI);
+                                  MachineBasicBlock::iterator CopyDefI);
 
   unsigned promoteCondToReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator TestPos,
@@ -356,9 +357,14 @@
     // Nothing to do for a degenerate empty function...
     return false;
 
+  // Collect the copies in RPO so that when there are chains where a copy is in
+  // turn copied again we visit the first one first. This ensures we can find
+  // viable locations for testing the original EFLAGS that dominate all the
+  // uses across complex CFGs.
   SmallVector<MachineInstr *, 4> Copies;
-  for (MachineBasicBlock &MBB : MF)
-    for (MachineInstr &MI : MBB)
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (MachineBasicBlock *MBB : RPOT)
+    for (MachineInstr &MI : *MBB)
       if (MI.getOpcode() == TargetOpcode::COPY &&
           MI.getOperand(0).getReg() == X86::EFLAGS)
         Copies.push_back(&MI);
@@ -407,12 +413,99 @@
     if (DOp.isDead())
       continue;
 
-    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+    MachineBasicBlock *TestMBB = CopyDefI.getParent();
     auto TestPos = CopyDefI.getIterator();
     DebugLoc TestLoc = CopyDefI.getDebugLoc();
 
     LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
 
+    // Walk up across live-in EFLAGS to find where they were actually def'ed.
+    //
+    // This copy's def may just be part of a region of blocks covered by
+    // a single def of EFLAGS and we want to find the top of that region where
+    // possible.
+    //
+    // This is essentially a search for a *candidate* reaching definition
+    // location. We don't need to ever find the actual reaching definition here,
+    // but we want to walk up the dominator tree to find the highest point which
+    // would be viable for such a definition.
+    auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+                                MachineBasicBlock::iterator End) {
+      // Scan backwards as we expect these to be relatively short and often find
+      // a clobber near the end.
+      return llvm::any_of(
+          llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+            // Flag any instruction (other than the copy we are
+            // currently rewriting) that defs EFLAGS.
+            return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+    };
+    auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+                                    MachineBasicBlock *EndMBB) {
+      assert(MDT->dominates(BeginMBB, EndMBB) &&
+             "Only support paths down the dominator tree!");
+      SmallPtrSet<MachineBasicBlock *, 4> Visited;
+      SmallVector<MachineBasicBlock *, 4> Worklist;
+      // We terminate at the beginning. No need to scan it.
+      Visited.insert(BeginMBB);
+      Worklist.push_back(EndMBB);
+      do {
+        auto *MBB = Worklist.pop_back_val();
+        for (auto *PredMBB : MBB->predecessors()) {
+          if (!Visited.insert(PredMBB).second)
+            continue;
+          if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+            return true;
+          // Enqueue this block to walk its predecessors.
+          Worklist.push_back(PredMBB);
+        }
+      } while (!Worklist.empty());
+      // No clobber found along a path from begin to end.
+      return false;
+    };
+    while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+           !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+      // Find the nearest common dominator of the predecessors, as
+      // that will be the best candidate to hoist into.
+      MachineBasicBlock *HoistMBB =
+          std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+                          *TestMBB->pred_begin(),
+                          [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+                            return MDT->findNearestCommonDominator(LHS, RHS);
+                          });
+
+      // Now we need to scan all predecessors that may be reached along paths to
+      // the hoist block. A clobber anywhere in any of these blocks prevents the
+      // hoist. Note that this even handles loops because we require *no* clobbers.
+      if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+        break;
+
+      // We also need the terminators to not sneakily clobber flags.
+      if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+                           HoistMBB->instr_end()))
+        break;
+
+      // We found a viable location, hoist our test position to it.
+      TestMBB = HoistMBB;
+      TestPos = TestMBB->getFirstTerminator()->getIterator();
+      // Clear the debug location as it would just be confusing after hoisting.
+      TestLoc = DebugLoc();
+    }
+    LLVM_DEBUG({
+      auto DefIt = llvm::find_if(
+          llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+          [&](MachineInstr &MI) {
+            return MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+      if (DefIt.base() != TestMBB->instr_begin()) {
+        dbgs() << "  Using EFLAGS defined by: ";
+        DefIt->dump();
+      } else {
+        dbgs() << "  Using live-in flags for BB:\n";
+        TestMBB->dump();
+      }
+    });
+
     // While rewriting uses, we buffer jumps and rewrite them in a second pass
     // because doing so will perturb the CFG that we are walking to find the
     // uses in the first place.
@@ -423,7 +516,7 @@
     // very few of them and we expect to not revisit the same copy definition
     // many times. If either of those change sufficiently we could build a map
     // of these up front instead.
-    CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+    CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
 
     // Collect the basic blocks we need to scan. Typically this will just be
     // a single basic block but we may have to scan multiple blocks if the
@@ -431,7 +524,6 @@
     SmallVector<MachineBasicBlock *, 2> Blocks;
     SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
     Blocks.push_back(&MBB);
-    VisitedBlocks.insert(&MBB);
     do {
       MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
 
@@ -439,36 +531,28 @@
       // Track when if/when we find a kill of the flags in this block.
       bool FlagsKilled = false;
 
-      // We currently don't do any PHI insertion and so we require that the
-      // test basic block dominates all of the use basic blocks.
-      //
-      // We could in theory do PHI insertion here if it becomes useful by just
-      // taking undef values in along every edge that we don't trace this
-      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
-      // but still seems like a great deal of complexity.
-      //
-      // Because it is theoretically possible that some earlier MI pass or
-      // other lowering transformation could induce this to happen, we do
-      // a hard check even in non-debug builds here.
-      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
-        LLVM_DEBUG({
-          dbgs() << "ERROR: Encountered use that is not dominated by our test "
-                    "basic block! Rewriting this would require inserting PHI "
-                    "nodes to track the flag state across the CFG.\n\nTest "
-                    "block:\n";
-          TestMBB.dump();
-          dbgs() << "Use block:\n";
-          UseMBB.dump();
-        });
-        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
-                           "does not dominate all uses.");
-      }
-
-      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
-                                      : UseMBB.instr_begin(),
+      // In most cases, we walk from the beginning to the end of the block. But
+      // when the block is the same block the copy is from, we will visit it
+      // twice. The first time we start from the copy and go to the end. The
+      // second time we start from the beginning and go to the copy. This lets
+      // us handle copies inside of cycles.
+      for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+                          ? std::next(CopyI->getIterator())
+                          : UseMBB.instr_begin(),
               MIE = UseMBB.instr_end();
           MII != MIE;) {
        MachineInstr &MI = *MII++;
+        // If we are in the original copy block and encounter either the copy
+        // def or the copy itself, break so that we don't re-process any part of
+        // the block or process the instructions in the range that was copied
+        // over.
+        if (&MI == CopyI || &MI == &CopyDefI) {
+          assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+                 "Should only encounter these on the second pass over the "
+                 "original block.");
+          break;
+        }
+
        MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
        if (!FlagUse) {
          if (MI.findRegisterDefOperand(X86::EFLAGS)) {
@@ -512,10 +596,10 @@
 
       // Otherwise we can just rewrite in-place.
       if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
-        rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+        rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
       } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
                  X86::COND_INVALID) {
-        rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+        rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
       } else if (MI.getOpcode() == TargetOpcode::COPY) {
         rewriteCopy(MI, *FlagUse, CopyDefI);
       } else {
@@ -538,13 +622,13 @@
         case X86::SETB_C64r:
           // Use custom lowering for arithmetic that is merely extending the
           // carry flag. We model this as the SETB_C* pseudo instructions.
-          rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+          rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                                   CondRegs);
           break;
 
         default:
           // Generically handle remaining uses as arithmetic instructions.
-          rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+          rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                             CondRegs);
           break;
         }
@@ -564,8 +648,38 @@
       // and queue those up for processing.
       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
-            VisitedBlocks.insert(SuccMBB).second)
+            VisitedBlocks.insert(SuccMBB).second) {
+          // We currently don't do any PHI insertion and so we require that the
+          // test basic block dominates all of the use basic blocks. Further, we
+          // can't have a cycle from the test block back to itself as that would
+          // create a cycle requiring a PHI to break it.
+          //
+          // We could in theory do PHI insertion here if it becomes useful by
+          // just taking undef values in along every edge that we don't trace
+          // this EFLAGS copy along. This isn't as bad as fully general PHI
+          // insertion, but still seems like a great deal of complexity.
+          //
+          // Because it is theoretically possible that some earlier MI pass or
+          // other lowering transformation could induce this to happen, we do
+          // a hard check even in non-debug builds here.
+          if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+            LLVM_DEBUG({
+              dbgs()
+                  << "ERROR: Encountered use that is not dominated by our test "
+                     "basic block! Rewriting this would require inserting PHI "
+                     "nodes to track the flag state across the CFG.\n\nTest "
+                     "block:\n";
+              TestMBB->dump();
+              dbgs() << "Use block:\n";
+              SuccMBB->dump();
+            });
+            report_fatal_error(
+                "Cannot lower EFLAGS copy when original copy def "
+                "does not dominate all uses.");
+          }
+
           Blocks.push_back(SuccMBB);
+        }
     } while (!Blocks.empty());
 
     // Now rewrite the jumps that use the flags. These we handle specially
@@ -580,7 +694,7 @@
       else
         LastJmpMBB = JmpI->getParent();
 
-      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+      rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
     }
 
     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -604,14 +718,13 @@
 
 /// Collect any conditions that have already been set in registers so that we
 /// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
-                                             MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
   CondRegArray CondRegs = {};
 
   // Scan backwards across the range of instructions with live EFLAGS.
-  for (MachineInstr &MI : llvm::reverse(
-           llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+  for (MachineInstr &MI :
+       llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
    X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
    if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
        TRI->isVirtualRegister(MI.getOperand(0).getReg()))
Index: llvm/test/CodeGen/X86/flags-copy-lowering.mir
===================================================================
--- llvm/test/CodeGen/X86/flags-copy-lowering.mir
+++ llvm/test/CodeGen/X86/flags-copy-lowering.mir
@@ -84,6 +84,12 @@
     call void @foo()
     ret i64 0
   }
+
+  define i64 @test_mid_cycle_copies(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret i64 0
+  }
 ...
 ---
 name: test_branch
@@ -738,3 +744,195 @@
     RET 0, $rax
 
 ...
+---
+# This test case is designed to exercise a particularly challenging situation:
+# when the flags are copied and restored *inside* of a complex and cyclic CFG,
+# all of whose blocks have live-in flags. To correctly handle this case we have
+# to walk up the dominator tree and locate a viable reaching definition
+# location, checking for clobbers along any path. The CFG for this function
+# looks like the following diagram, control flowing out the bottom of blocks
+# and in the top:
+#
+#       bb.0
+#        |
+#        |    _______________________
+#        |   /                       \
+#       bb.1                          |
+#        |\_________                  |
+#        |  __      \      ____       |
+#        | /  \      \    /    \      |
+#       bb.2   |     bb.4       |     |
+#        |\___/      /  \       |     |
+#        |          /    \      |     |
+#       bb.3     bb.5    bb.6   |     |
+#        |          \    /      |     |
+#        |           \  /       |     |
+#        |           bb.7       |     |
+#        |   __________/ \______/     |
+#        |  /                         |
+#       bb.8                          |
+#        |\___________________________/
+#        |
+#       bb.9
+#
+# We set EFLAGS in bb.0, clobber them in bb.3, and copy them in bb.2 and bb.6.
+# Because of the cycles this requires hoisting the `SETcc` instructions that
+# capture the flags for the bb.6 copy up to `bb.1` and using them for the copy
+# in `bb.2` as well, despite the clobber in `bb.3`. The clobber in `bb.3` also
+# prevents hoisting the `SETcc`s up to `bb.0`.
+#
+# Throughout the test we use branch instructions that are totally bogus (as the
+# flags are obviously not changing!) but this is just to allow us to send
+# a small but complex CFG structure through the backend and force it to choose
+# plausible lowering decisions based on the core CFG presented, regardless of
+# the futility of the actual branches.
+name: test_mid_cycle_copies
+# CHECK-LABEL: name: test_mid_cycle_copies
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+    ; CHECK: bb.0:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.4
+    liveins: $eflags
+
+    ; Outer loop header, target for one set of hoisting.
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.4
+    ; CHECK: bb.1:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags
+    ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags
+    ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.2:
+    successors: %bb.2, %bb.3
+    liveins: $eflags
+
+    ; Inner loop with a local copy. We should eliminate this but can't hoist.
+    %2:gr64 = COPY $eflags
+    $eflags = COPY %2
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.3
+    ; CHECK: bb.2:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+    ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.3:
+    successors: %bb.8
+    liveins: $eflags
+
+    ; Use and then clobber $eflags. Then hop to the outer loop latch.
+    %3:gr64 = ADC64ri32 %0, 42, implicit-def dead $eflags, implicit $eflags
+    ; CHECK: bb.3:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[B_REG]], 255, implicit-def $eflags
+    ; CHECK-NEXT: %3:gr64 = ADC64ri32 %0, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    JMP_1 %bb.8
+
+  bb.4:
+    successors: %bb.5, %bb.6
+    liveins: $eflags
+
+    ; Another inner loop, this one with a diamond.
+    JE_1 %bb.5, implicit $eflags
+    JMP_1 %bb.6
+    ; CHECK: bb.4:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+    ; CHECK-NEXT: JNE_1 %bb.5, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.5:
+    successors: %bb.7
+    liveins: $eflags
+
+    ; Just use $eflags on this side of the diamond.
+    %4:gr64 = CMOVA64rr %0, %1, implicit $eflags
+    ; CHECK: bb.5:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+    ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    JMP_1 %bb.7
+
+  bb.6:
+    successors: %bb.7
+    liveins: $eflags
+
+    ; Use, copy, and then use $eflags again.
+    %5:gr64 = CMOVA64rr %0, %1, implicit $eflags
+    ; CHECK: bb.6:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+    ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+
+    %6:gr64 = COPY $eflags
+    $eflags = COPY %6:gr64
+
+    %7:gr64 = CMOVA64rr %0, %1, implicit $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+    ; CHECK-NEXT: %7:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+    JMP_1 %bb.7
+
+  bb.7:
+    successors: %bb.4, %bb.8
+    liveins: $eflags
+
+    ; Inner loop latch.
+    JE_1 %bb.4, implicit $eflags
+    JMP_1 %bb.8
+    ; CHECK: bb.7:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+    ; CHECK-NEXT: JNE_1 %bb.4, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.8:
+    successors: %bb.1, %bb.9
+
+    ; Outer loop latch. Note that we cannot have EFLAGS live-in here as that
+    ; immediately requires PHIs.
+    CMP64rr %0, %1, implicit-def $eflags
+    JE_1 %bb.1, implicit $eflags
+    JMP_1 %bb.9
+    ; CHECK: bb.8:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+    ; CHECK-NEXT: JE_1 %bb.1, implicit $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.9:
+    liveins: $eflags
+
+    ; And we're done.
+    %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+    $rax = COPY %8
+    RET 0, $rax
+    ; CHECK: bb.9:
+    ; CHECK-NOT: $eflags
+    ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+
+...