Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -1177,6 +1177,20 @@
 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
+  if (BO->getParent() != Cmp->getParent()) {
+    // We used to use a dominator tree here to allow multi-block optimization.
+    // But that was problematic because:
+    // 1. It could cause a perf regression by hoisting the math op into the
+    //    critical path.
+    // 2. It could cause a perf regression by creating a value that was live
+    //    across multiple blocks and increasing register pressure.
+    // 3. Use of a dominator tree could cause large compile-time regression.
+    //    This is because we recompute the DT on every change in the main CGP
+    //    run-loop. The recomputing is probably unnecessary in many cases, so if
+    //    that was fixed, using a DT here would be ok.
+    return false;
+  }
+
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   Value *Arg0 = BO->getOperand(0);
   Value *Arg1 = BO->getOperand(1);
@@ -1186,36 +1200,15 @@
     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
   }
 
-  Instruction *InsertPt;
-  if (BO->hasOneUse() && BO->user_back() == Cmp) {
-    // If the math is only used by the compare, insert at the compare to keep
-    // the condition in the same block as its users. (CGP aggressively sinks
-    // compares to help out SDAG.)
-    InsertPt = Cmp;
-  } else {
-    // The math and compare may be independent instructions. Check dominance to
-    // determine the insertion point for the intrinsic.
-    bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
-    if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
-      return false;
-
-    BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
-    if (MathBB != CmpBB) {
-      // Avoid hoisting an extra op into a dominating block and creating a
-      // potentially longer critical path.
-      if (!MathDominates)
-        return false;
-      // Check that the insertion doesn't create a value that is live across
-      // more than two blocks, so to minimise the increase in register pressure.
-      BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
-      BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
-      auto Successors = successors(Dominator);
-      if (llvm::find(Successors, Dominated) == Successors.end())
-        return false;
+  // Insert at the first instruction of the pair.
+  Instruction *InsertPt = nullptr;
+  for (Instruction &Iter : *Cmp->getParent()) {
+    if (&Iter == BO || &Iter == Cmp) {
+      InsertPt = &Iter;
+      break;
     }
-
-    InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
   }
+  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
 
   IRBuilder<> Builder(InsertPt);
   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
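For reference, a minimal sketch of the single-block shape that is still handled (hypothetical IR, not taken from the patch): when the math op and the compare share a parent block, inserting the intrinsic at whichever of the pair comes first keeps the extracted results dominating every existing use of either instruction.

    define i1 @sketch(i64 %x, i64 %y, i64* %p) {
      %s = sub i64 %x, %y         ; math op: first of the pair, so the
                                  ; intrinsic is inserted at this position
      store i64 %s, i64* %p
      %ov = icmp ult i64 %x, %y   ; overflow check: folded together with %s
                                  ; into @llvm.usub.with.overflow.i64
      ret i1 %ov
    }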
Index: llvm/trunk/test/CodeGen/X86/cgp-usubo.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/cgp-usubo.ll
+++ llvm/trunk/test/CodeGen/X86/cgp-usubo.ll
@@ -121,7 +121,7 @@
   ret i1 %ov
 }
 
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
 
 declare void @call(i1)
 
@@ -131,14 +131,17 @@
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    je .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %t
-; CHECK-NEXT:    subq %rsi, %rdi
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    movq %rax, (%rdx)
 ; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    jne .LBB8_3
+; CHECK-NEXT:    je .LBB8_2
+; CHECK-NEXT:  # %bb.3: # %end
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB8_2: # %f
 ; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:  .LBB8_3: # %end
 ; CHECK-NEXT:    retq
 entry:
   br i1 %cond, label %t, label %f
Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
@@ -14,11 +14,10 @@
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[T1:%.*]], i32 1)
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MATH]], [[MUL]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[NEG:%.*]] = add i32 [[T1:%.*]], -1
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[NEG]], [[MUL]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL]]
 ;
 entry:
   %rem = srem i32 %x, 2
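A sketch of the multi-block shape that is no longer transformed (condensed from the usubo tests above; the function name and parameter order are illustrative): the sub and the compare have different parent blocks, so the new early-out fires and both instructions are left as-is for the backend.

    define i1 @sketch_multi_bb(i64 %x, i64 %y, i64* %p, i1 %cond) {
    entry:
      br i1 %cond, label %t, label %f
    t:
      %s = sub i64 %x, %y        ; math op lives in %t
      store i64 %s, i64* %p
      br i1 %cond, label %end, label %f
    f:
      ret i1 %cond
    end:
      %ov = icmp ult i64 %x, %y  ; compare lives in %end: different parent,
                                 ; so replaceMathCmpWithIntrinsic bails out
      ret i1 %ov
    }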
Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -47,15 +47,16 @@
   ret i64 %Q
 }
 
+; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
 ; CHECK-LABEL: @uaddo4(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
+; CHECK-NEXT:    [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
 ; CHECK-NEXT:    ret i64 [[Q]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i64 0
@@ -362,7 +363,7 @@
   ret i1 %ov
 }
 
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
 
 declare void @call(i1)
 
@@ -371,15 +372,14 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[END:%.*]], label [[F]]
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    ret i1 [[OV1]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[OV]]
 ;
 entry:
   br i1 %cond, label %t, label %f
@@ -514,6 +514,26 @@
   ret void
 }
 
+; This was crashing when trying to delay instruction removal/deletion.
+
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1 immarg, i1 immarg, i1 immarg) #0
+
+define hidden fastcc void @crash() {
+; CHECK-LABEL: @crash(
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[T2:%.*]] = select i1 undef, i1 undef, i1 [[OV]]
+; CHECK-NEXT:    unreachable
+;
+  %t0 = add i64 undef, undef
+  %t1 = icmp ult i64 %t0, undef
+  %t2 = select i1 undef, i1 undef, i1 %t1
+  %t3 = call i64 @llvm.objectsize.i64.p0i8(i8* nonnull undef, i1 false, i1 false, i1 false)
+  %t4 = icmp ugt i64 %t3, 7
+  unreachable
+}
+
 ; Check that every instruction inserted by -codegenprepare has a debug location.
 ; DEBUG: CheckModuleDebugify: PASS
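The "TODO?" note above @uaddo4 is about pass ordering inside CGP itself: the add and the icmp start out in the same block, but CGP sinks the compare down to its use in %next before the overflow matching runs, so by then the same-block requirement can no longer be met. A condensed before/after sketch of that test (hypothetical IR, consistent with the CHECK lines above):

    ; Input: the add and the icmp both sit in entry, so the pair would match.
    entry:
      %add = add i64 %b, %a
      %cmp = icmp ugt i64 %b, %add
      br i1 %c, label %next, label %exit

    ; After CGP sinks the compare, the icmp lives in %next while the add
    ; stays in entry, so the new same-block check rejects the pair.
    entry:
      %add = add i64 %b, %a
      br i1 %c, label %next, label %exit
    next:
      %cmp = icmp ugt i64 %b, %add
      %Q = select i1 %cmp, i64 %b, i64 42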