Index: llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -98,6 +99,7 @@
   const X86InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const TargetRegisterClass *PromoteRC;
+  MachineDominatorTree *MDT;
 
   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                   MachineInstr &CopyDefI);
@@ -145,6 +147,7 @@
 char X86FlagsCopyLoweringPass::ID = 0;
 
 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -342,6 +345,7 @@
   MRI = &MF.getRegInfo();
   TII = Subtarget.getInstrInfo();
   TRI = Subtarget.getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   PromoteRC = &X86::GR8RegClass;
 
   if (MF.begin() == MF.end())
@@ -416,103 +420,142 @@
     // of these up front instead.
     CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
 
-    for (auto MII = std::next(CopyI->getIterator()), MIE = MBB.instr_end();
-         MII != MIE;) {
-      MachineInstr &MI = *MII++;
-      MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
-      if (!FlagUse) {
-        if (MI.findRegisterDefOperand(X86::EFLAGS)) {
-          // If EFLAGS are defined, it's as-if they were killed. We can stop
-          // scanning here.
-          //
-          // NB!!! Many instructions only modify some flags. LLVM currently
-          // models this as clobbering all flags, but if that ever changes this
-          // will need to be carefully updated to handle that more complex
-          // logic.
+    // Collect the basic blocks we need to scan. Typically this will just be
+    // a single basic block but we may have to scan multiple blocks if the
+    // EFLAGS copy lives into successors.
+    SmallVector<MachineBasicBlock *, 2> Blocks;
+    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+    Blocks.push_back(&MBB);
+    VisitedBlocks.insert(&MBB);
+
+    do {
+      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
+
+      // We currently don't do any PHI insertion and so we require that the
+      // test basic block dominates all of the use basic blocks.
+      //
+      // We could in theory do PHI insertion here if it becomes useful by just
+      // taking undef values in along every edge that we don't trace this
+      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
+      // but still seems like a great deal of complexity.
+      //
+      // Because it is theoretically possible that some earlier MI pass or
+      // other lowering transformation could induce this to happen, we do
+      // a hard check even in non-debug builds here.
+      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
+        DEBUG({
+          dbgs() << "ERROR: Encountered use that is not dominated by our test "
+                    "basic block! Rewriting this would require inserting PHI "
+                    "nodes to track the flag state across the CFG.\n\nTest "
+                    "block:\n";
+          TestMBB.dump();
+          dbgs() << "Use block:\n";
+          UseMBB.dump();
+        });
+        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
                           "does not dominate all uses.");
+      }
+
+      for (auto MII = &UseMBB == &MBB ?
std::next(CopyI->getIterator()) + : UseMBB.instr_begin(), + MIE = UseMBB.instr_end(); + MII != MIE;) { + MachineInstr &MI = *MII++; + MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS); + if (!FlagUse) { + if (MI.findRegisterDefOperand(X86::EFLAGS)) { + // If EFLAGS are defined, it's as-if they were killed. We can stop + // scanning here. + // + // NB!!! Many instructions only modify some flags. LLVM currently + // models this as clobbering all flags, but if that ever changes + // this will need to be carefully updated to handle that more + // complex logic. + FlagsKilled = true; + break; + } + continue; + } + + DEBUG(dbgs() << " Rewriting use: "; MI.dump()); + + // Check the kill flag before we rewrite as that may change it. + if (FlagUse->isKill()) FlagsKilled = true; + + // Once we encounter a branch, the rest of the instructions must also be + // branches. We can't rewrite in place here, so we handle them below. + // + // Note that we don't have to handle tail calls here, even conditional + // tail calls, as those are not introduced into the X86 MI until post-RA + // branch folding or black placement. As a consequence, we get to deal + // with the simpler formulation of conditional branches followed by tail + // calls. + if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { + auto JmpIt = MI.getIterator(); + do { + JmpIs.push_back(&*JmpIt); + ++JmpIt; + } while (JmpIt != UseMBB.instr_end() && + X86::getCondFromBranchOpc(JmpIt->getOpcode()) != + X86::COND_INVALID); break; } - continue; - } - DEBUG(dbgs() << " Rewriting use: "; MI.dump()); - - // Check the kill flag before we rewrite as that may change it. - if (FlagUse->isKill()) - FlagsKilled = true; + // Otherwise we can just rewrite in-place. + if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { + rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (X86::getCondFromSETOpc(MI.getOpcode()) != + X86::COND_INVALID) { + rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (MI.getOpcode() == TargetOpcode::COPY) { + rewriteCopy(MI, *FlagUse, CopyDefI); + } else { + // We assume that arithmetic instructions that use flags also def + // them. + assert(MI.findRegisterDefOperand(X86::EFLAGS) && + "Expected a def of EFLAGS for this instruction!"); + + // NB!!! Several arithmetic instructions only *partially* update + // flags. Theoretically, we could generate MI code sequences that + // would rely on this fact and observe different flags independently. + // But currently LLVM models all of these instructions as clobbering + // all the flags in an undef way. We rely on that to simplify the + // logic. + FlagsKilled = true; - // Once we encounter a branch, the rest of the instructions must also be - // branches. We can't rewrite in place here, so we handle them below. - // - // Note that we don't have to handle tail calls here, even conditional - // tail calls, as those are not introduced into the X86 MI until post-RA - // branch folding or black placement. As a consequence, we get to deal - // with the simpler formulation of conditional branches followed by tail - // calls. 
- if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) { - auto JmpIt = MI.getIterator(); - do { - JmpIs.push_back(&*JmpIt); - ++JmpIt; - } while (JmpIt != MBB.instr_end() && - X86::getCondFromBranchOpc(JmpIt->getOpcode()) != - X86::COND_INVALID); - break; - } + rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + break; + } - // Otherwise we can just rewrite in-place. - if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) { - rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) { - rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - } else if (MI.getOpcode() == TargetOpcode::COPY) { - rewriteCopy(MI, *FlagUse, CopyDefI); - } else { - // We assume that arithmetic instructions that use flags also def them. - assert(MI.findRegisterDefOperand(X86::EFLAGS) && - "Expected a def of EFLAGS for this instruction!"); - - // NB!!! Several arithmetic instructions only *partially* update - // flags. Theoretically, we could generate MI code sequences that - // would rely on this fact and observe different flags independently. - // But currently LLVM models all of these instructions as clobbering - // all the flags in an undef way. We rely on that to simplify the - // logic. - FlagsKilled = true; - - rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); - break; + // If this was the last use of the flags, we're done. + if (FlagsKilled) + break; } - // If this was the last use of the flags, we're done. + // If the flags were killed, we're done with this block. if (FlagsKilled) break; - } - // If we didn't find a kill (or equivalent) check that the flags don't - // live-out of the basic block. Currently we don't support lowering copies - // of flags that live out in this fashion. - if (!FlagsKilled && - llvm::any_of(MBB.successors(), [](MachineBasicBlock *SuccMBB) { - return SuccMBB->isLiveIn(X86::EFLAGS); - })) { - DEBUG({ - dbgs() << "ERROR: Found a copied EFLAGS live-out from basic block:\n" - << "----\n"; - MBB.dump(); - dbgs() << "----\n" - << "ERROR: Cannot lower this EFLAGS copy!\n"; - }); - report_fatal_error( - "Cannot lower EFLAGS copy that lives out of a basic block!"); - } + // Otherwise we need to scan successors for ones where the flags live-in + // and queue those up for processing. + for (MachineBasicBlock *SuccMBB : UseMBB.successors()) + if (SuccMBB->isLiveIn(X86::EFLAGS) && + VisitedBlocks.insert(SuccMBB).second) + Blocks.push_back(SuccMBB); + } while (!Blocks.empty()); // Now rewrite the jumps that use the flags. These we handle specially - // because if there are multiple jumps we'll have to do surgery on the CFG. + // because if there are multiple jumps in a single basic block we'll have + // to do surgery on the CFG. + MachineBasicBlock *LastJmpMBB = nullptr; for (MachineInstr *JmpI : JmpIs) { - // Past the first jump we need to split the blocks apart. - if (JmpI != JmpIs.front()) + // Past the first jump within a basic block we need to split the blocks + // apart. 
+ if (JmpI->getParent() == LastJmpMBB) splitBlock(*JmpI->getParent(), *JmpI, *TII); + else + LastJmpMBB = JmpI->getParent(); rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs); } Index: llvm/test/CodeGen/X86/O0-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/O0-pipeline.ll +++ llvm/test/CodeGen/X86/O0-pipeline.ll @@ -37,6 +37,7 @@ ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Expand ISel Pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation +; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: Eliminate PHI nodes for register allocation Index: llvm/test/CodeGen/X86/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/O3-pipeline.ll +++ llvm/test/CodeGen/X86/O3-pipeline.ll @@ -90,6 +90,7 @@ ; CHECK-NEXT: X86 LEA Optimize ; CHECK-NEXT: X86 Optimize Call Frame ; CHECK-NEXT: X86 Avoid Store Forwarding Block +; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: Detect Dead Lanes Index: llvm/test/CodeGen/X86/copy-eflags.ll =================================================================== --- llvm/test/CodeGen/X86/copy-eflags.ll +++ llvm/test/CodeGen/X86/copy-eflags.ll @@ -196,3 +196,99 @@ tail call void @external_b() ret void } + +; Test a function that gets special select lowering into CFG with copied EFLAGS +; threaded across the CFG. This requires our EFLAGS copy rewriting to handle +; cross-block rewrites in at least some narrow cases. +; +; Note that the 'undef' values here are significant! Other values will in many +; cases not effectively trigger the lowering we're interested in exercising. 
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3) { +; X32-LABEL: PR37100: +; X32: # %bb.0: # %bb +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -16 +; X32-NEXT: .cfi_offset %edi, -12 +; X32-NEXT: .cfi_offset %ebx, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movb {{[0-9]+}}(%esp), %bl +; X32-NEXT: jmp .LBB3_1 +; X32-NEXT: .p2align 4, 0x90 +; X32-NEXT: .LBB3_5: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: idivl %edi +; X32-NEXT: .LBB3_1: # %bb1 +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: movsbl %bl, %eax +; X32-NEXT: movl %eax, %edx +; X32-NEXT: sarl $31, %edx +; X32-NEXT: cmpl %eax, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: sbbl %edx, %eax +; X32-NEXT: setl %al +; X32-NEXT: setl %dl +; X32-NEXT: movzbl %dl, %edi +; X32-NEXT: negl %edi +; X32-NEXT: testb $-1, %al +; X32-NEXT: jne .LBB3_3 +; X32-NEXT: # %bb.2: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: # implicit-def: $bl +; X32-NEXT: .LBB3_3: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: testb $-1, %al +; X32-NEXT: jne .LBB3_5 +; X32-NEXT: # %bb.4: # %bb1 +; X32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X32-NEXT: # implicit-def: $edi +; X32-NEXT: jmp .LBB3_5 +; +; X64-LABEL: PR37100: +; X64: # %bb.0: # %bb +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: jmp .LBB3_1 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB3_3: # %bb1 +; X64-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X64-NEXT: cmovll %eax, %esi +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivl %esi +; X64-NEXT: .LBB3_1: # %bb1 +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movsbq %dil, %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setl %al +; X64-NEXT: negl %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jl .LBB3_3 +; X64-NEXT: # %bb.2: # %bb1 +; X64-NEXT: # in Loop: Header=BB3_1 Depth=1 +; X64-NEXT: # implicit-def: $dil +; X64-NEXT: jmp .LBB3_3 +bb: + br label %bb1 + +bb1: + %tmp = phi i8 [ %tmp8, %bb1 ], [ %arg1, %bb ] + %tmp2 = phi i16 [ %tmp11, %bb1 ], [ %arg2, %bb ] + %tmp3 = icmp sgt i16 %tmp2, 7 + %tmp4 = select i1 %tmp3, i16 %tmp2, i16 7 + %tmp5 = sext i8 %tmp to i64 + %tmp6 = icmp slt i64 %arg3, %tmp5 + %tmp7 = sext i1 %tmp6 to i32 + %tmp8 = select i1 %tmp6, i8 %tmp, i8 undef + %tmp9 = select i1 %tmp6, i32 %tmp7, i32 undef + %tmp10 = srem i32 0, %tmp9 + %tmp11 = trunc i32 %tmp10 to i16 + br label %bb1 +}
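
For readers who want the shape of the new cross-block handling without the surrounding pass machinery, here is a minimal standalone sketch of the worklist pattern the patch introduces. It is not LLVM code: Block, EFLAGSLiveIn, dominates, and scanBlockForFlagUses are invented stand-ins for MachineBasicBlock, MBB.isLiveIn(X86::EFLAGS), MachineDominatorTree::dominates, and the per-instruction rewrite loop, and the control flow is simplified relative to the pass itself (the real code calls report_fatal_error when dominance fails rather than returning).

// Standalone illustration only -- every type and helper below is a
// hypothetical stand-in, not an LLVM API.
#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int ID = 0;
  bool EFLAGSLiveIn = false;          // stand-in for MBB.isLiveIn(X86::EFLAGS)
  std::vector<Block *> Successors;
};

// Stand-in for MachineDominatorTree::dominates(&TestMBB, &UseMBB).
static bool dominates(const Block *A, const Block *B) {
  (void)A;
  (void)B;
  return true; // assume dominance holds for this sketch
}

// Stand-in for the per-block rewrite loop. Returns true if the flags are
// clobbered (killed) somewhere in the block, so nothing can live out of it.
static bool scanBlockForFlagUses(Block &B) {
  std::printf("rewriting EFLAGS uses in block %d\n", B.ID);
  return false;
}

// The worklist pattern used above: start at the block containing the EFLAGS
// copy and, whenever the flags survive to the end of a block, queue every
// successor that carries EFLAGS live-in, visiting each block at most once and
// insisting that the test block dominates every block we rewrite in.
static void scanFrom(Block &TestMBB, Block &CopyMBB) {
  std::vector<Block *> Worklist{&CopyMBB};
  std::set<Block *> Visited{&CopyMBB};
  do {
    Block &UseMBB = *Worklist.back();
    Worklist.pop_back();

    // No PHI insertion is done, so refuse non-dominated use blocks.
    if (&UseMBB != &TestMBB && !dominates(&TestMBB, &UseMBB)) {
      std::fprintf(stderr, "use block %d not dominated by test block\n",
                   UseMBB.ID);
      return;
    }

    if (scanBlockForFlagUses(UseMBB))
      continue; // flags clobbered here; they cannot be live into successors

    for (Block *Succ : UseMBB.Successors)
      if (Succ->EFLAGSLiveIn && Visited.insert(Succ).second)
        Worklist.push_back(Succ);
  } while (!Worklist.empty());
}

int main() {
  Block Entry{0}, Left{1}, Right{2};
  Left.EFLAGSLiveIn = Right.EFLAGSLiveIn = true;
  Entry.Successors = {&Left, &Right};
  scanFrom(Entry, Entry); // test block and copy block coincide here
  return 0;
}

Because the test block is required to dominate every block that gets rewritten, the condition values materialized next to the test are available at each rewritten use, which is what lets the pass keep skipping PHI insertion even though the scan now crosses block boundaries.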