Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -1177,6 +1177,20 @@
 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
+  if (BO->getParent() != Cmp->getParent()) {
+    // We used to use a dominator tree here to allow multi-block optimization.
+    // But that was problematic because:
+    // 1. It could cause a perf regression by hoisting the math op into the
+    //    critical path.
+    // 2. It could cause a perf regression by creating a value that was live
+    //    across multiple blocks and increasing register pressure.
+    // 3. Use of a dominator tree could cause large compile-time regression.
+    //    This is because we recompute the DT on every change in the main CGP
+    //    run-loop. The recomputing is probably unnecessary in many cases, so if
+    //    that was fixed, using a DT here would be ok.
+    return false;
+  }
+
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   Value *Arg0 = BO->getOperand(0);
   Value *Arg1 = BO->getOperand(1);
@@ -1186,36 +1200,15 @@
     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
   }
 
-  Instruction *InsertPt;
-  if (BO->hasOneUse() && BO->user_back() == Cmp) {
-    // If the math is only used by the compare, insert at the compare to keep
-    // the condition in the same block as its users. (CGP aggressively sinks
-    // compares to help out SDAG.)
-    InsertPt = Cmp;
-  } else {
-    // The math and compare may be independent instructions. Check dominance to
-    // determine the insertion point for the intrinsic.
-    bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
-    if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
-      return false;
-
-    BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
-    if (MathBB != CmpBB) {
-      // Avoid hoisting an extra op into a dominating block and creating a
-      // potentially longer critical path.
-      if (!MathDominates)
-        return false;
-      // Check that the insertion doesn't create a value that is live across
-      // more than two blocks, so to minimise the increase in register pressure.
-      BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
-      BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
-      auto Successors = successors(Dominator);
-      if (llvm::find(Successors, Dominated) == Successors.end())
-        return false;
+  // Insert at the first instruction of the pair.
+  Instruction *InsertPt = nullptr;
+  for (Instruction &Iter : *Cmp->getParent()) {
+    if (&Iter == BO || &Iter == Cmp) {
+      InsertPt = &Iter;
+      break;
     }
-
-    InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
   }
+  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
 
   IRBuilder<> Builder(InsertPt);
   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
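For reference, a minimal sketch of the single-block shape that is still handled (hypothetical IR, not taken from the patch): when the math op and the compare share a parent block, inserting the intrinsic at whichever of the pair comes first keeps the extracted results dominating every existing use of either instruction.

    define i1 @sketch(i64 %x, i64 %y, i64* %p) {
      %s = sub i64 %x, %y         ; math op: first of the pair, so the
                                  ; intrinsic is inserted at this position
      store i64 %s, i64* %p
      %ov = icmp ult i64 %x, %y   ; overflow check: folded together with %s
                                  ; into @llvm.usub.with.overflow.i64
      ret i1 %ov
    }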
Index: llvm/trunk/test/CodeGen/X86/cgp-usubo.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/cgp-usubo.ll
+++ llvm/trunk/test/CodeGen/X86/cgp-usubo.ll
@@ -121,7 +121,7 @@
   ret i1 %ov
 }
 
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
 
 declare void @call(i1)
 
@@ -131,14 +131,17 @@
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    je .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %t
-; CHECK-NEXT:    subq %rsi, %rdi
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    movq %rax, (%rdx)
 ; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    jne .LBB8_3
+; CHECK-NEXT:    je .LBB8_2
+; CHECK-NEXT:  # %bb.3: # %end
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB8_2: # %f
 ; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:  .LBB8_3: # %end
 ; CHECK-NEXT:    retq
 entry:
   br i1 %cond, label %t, label %f
Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
@@ -14,11 +14,10 @@
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[T1:%.*]], i32 1)
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MATH]], [[MUL]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[NEG:%.*]] = add i32 [[T1:%.*]], -1
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[NEG]], [[MUL]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL]]
 ;
 entry:
   %rem = srem i32 %x, 2
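A sketch of the multi-block shape that is no longer transformed (condensed from the usubo tests above; the function name and parameter order are illustrative): the sub and the compare have different parent blocks, so the new early-out fires and both instructions are left as-is for the backend.

    define i1 @sketch_multi_bb(i64 %x, i64 %y, i64* %p, i1 %cond) {
    entry:
      br i1 %cond, label %t, label %f
    t:
      %s = sub i64 %x, %y        ; math op lives in %t
      store i64 %s, i64* %p
      br i1 %cond, label %end, label %f
    f:
      ret i1 %cond
    end:
      %ov = icmp ult i64 %x, %y  ; compare lives in %end: different parent,
                                 ; so replaceMathCmpWithIntrinsic bails out
      ret i1 %ov
    }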
Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -47,15 +47,16 @@
   ret i64 %Q
 }
 
+; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
 ; CHECK-LABEL: @uaddo4(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
+; CHECK-NEXT:    [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
 ; CHECK-NEXT:    ret i64 [[Q]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i64 0
@@ -362,7 +363,7 @@
   ret i1 %ov
 }
 
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
 
 declare void @call(i1)
 
@@ -371,15 +372,14 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[END:%.*]], label [[F]]
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    ret i1 [[OV1]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[OV]]
 ;
 entry:
   br i1 %cond, label %t, label %f
@@ -514,6 +514,26 @@
   ret void
 }
 
+; This was crashing when trying to delay instruction removal/deletion.
+
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1 immarg, i1 immarg, i1 immarg) #0
+
+define hidden fastcc void @crash() {
+; CHECK-LABEL: @crash(
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[T2:%.*]] = select i1 undef, i1 undef, i1 [[OV]]
+; CHECK-NEXT:    unreachable
+;
+  %t0 = add i64 undef, undef
+  %t1 = icmp ult i64 %t0, undef
+  %t2 = select i1 undef, i1 undef, i1 %t1
+  %t3 = call i64 @llvm.objectsize.i64.p0i8(i8* nonnull undef, i1 false, i1 false, i1 false)
+  %t4 = icmp ugt i64 %t3, 7
+  unreachable
+}
+
 ; Check that every instruction inserted by -codegenprepare has a debug location.
 ; DEBUG: CheckModuleDebugify: PASS
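The "TODO?" note above @uaddo4 is about pass ordering inside CGP itself: the add and the icmp start out in the same block, but CGP sinks the compare down to its use in %next before the overflow matching runs, so by then the same-block requirement can no longer be met. A condensed before/after sketch of that test (hypothetical IR, consistent with the CHECK lines above):

    ; Input: the add and the icmp both sit in entry, so the pair would match.
    entry:
      %add = add i64 %b, %a
      %cmp = icmp ugt i64 %b, %add
      br i1 %c, label %next, label %exit

    ; After CGP sinks the compare, the icmp lives in %next while the add
    ; stays in entry, so the new same-block check rejects the pair.
    entry:
      %add = add i64 %b, %a
      br i1 %c, label %next, label %exit
    next:
      %cmp = icmp ugt i64 %b, %add
      %Q = select i1 %cmp, i64 %b, i64 42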