diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -1173,6 +1173,11 @@
   if (ChangedLoop)
     SE->forgetLoop(L);
 
+  // The insertion point for the widening should be at the widenable call, not
+  // at the WidenableBR. If we do this at the WidenableBR, we can incorrectly
+  // change a loop-invariant condition to a loop-varying one.
+  auto *IP = cast<Instruction>(WidenableBR->getCondition());
+
   // The use of umin(all analyzeable exits) instead of latch is subtle, but
   // important for profitability. We may have a loop which hasn't been fully
   // canonicalized just yet. If the exit we chose to widen is provably never
@@ -1182,21 +1187,9 @@
   const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L);
   if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
       !SE->isLoopInvariant(MinEC, L) ||
-      !Rewriter.isSafeToExpandAt(MinEC, WidenableBR))
+      !Rewriter.isSafeToExpandAt(MinEC, IP))
     return ChangedLoop;
 
-  // Subtlety: We need to avoid inserting additional uses of the WC. We know
-  // that it can only have one transitive use at the moment, and thus moving
-  // that use to just before the branch and inserting code before it and then
-  // modifying the operand is legal.
-  auto *IP = cast<Instruction>(WidenableBR->getCondition());
-  // Here we unconditionally modify the IR, so after this point we should return
-  // only `true`!
-  IP->moveBefore(WidenableBR);
-  if (MSSAU)
-    if (auto *MUD = MSSAU->getMemorySSA()->getMemoryAccess(IP))
-      MSSAU->moveToPlace(MUD, WidenableBR->getParent(),
-                         MemorySSA::BeforeTerminator);
   Rewriter.setInsertPoint(IP);
   IRBuilder<> B(IP);
 
diff --git a/llvm/test/Transforms/LoopPredication/pr61963.ll b/llvm/test/Transforms/LoopPredication/pr61963.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopPredication/pr61963.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -passes=loop-predication < %s 2>&1 | FileCheck %s
+
+@global = external global ptr addrspace(1)
+
+; Do not convert the widenable condition %widenable_cond11 into a loop-varying
+; one, since that results in a miscompile: deopt9 would then have incorrect
+; deopt state.
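+;
+; Both widenable calls are defined in the entry block %bb, while the widenable
+; branch on %widenable_cond11 sits inside %loop_outer. Widening at the branch
+; (the removed IP->moveBefore(WidenableBR) path) would move the call into the
+; loop and make the widened condition loop-varying; the CHECK lines below
+; verify that both calls remain in %bb.
+;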
+; CHECK-LABEL: foo
+; CHECK-LABEL: bb
+; CHECK: %call = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT: %widenable_cond11 = call i1 @llvm.experimental.widenable.condition
+define i32 @foo(ptr addrspace(3) %arg) !prof !0 {
+bb:
+  %alloca = alloca i1, align 1
+  %getelementptr1 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 32
+  %load2 = load i32, ptr addrspace(3) %getelementptr1, align 4
+  %getelementptr7 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 72
+  %load8 = load ptr addrspace(1), ptr addrspace(3) %getelementptr7, align 8
+  %getelementptr9 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 80
+  %init_val = load i32, ptr addrspace(3) %getelementptr9, align 4
+  %getelementptr11 = getelementptr inbounds i8, ptr addrspace(3) %arg, i64 88
+  %load12 = load i32, ptr addrspace(3) %getelementptr11, align 4
+  store volatile i1 true, ptr %alloca, align 1
+  %load13 = load volatile i1, ptr %alloca, align 1
+  %icmp = icmp eq ptr addrspace(1) %load8, null
+  %getelementptr14 = getelementptr inbounds i8, ptr addrspace(1) %load8, i64 8
+  %getelementptr15 = getelementptr inbounds i8, ptr addrspace(1) %load8, i64 20
+  %call = call i1 @llvm.experimental.widenable.condition() #1
+  %widenable_cond11 = call i1 @llvm.experimental.widenable.condition() #1
+  br label %loop_outer
+
+loop_outer:                                       ; preds = %bb34, %bb
+  %phi = phi i32 [ %phi36, %bb34 ], [ 42, %bb ]
+  %phi18 = phi i32 [ 2, %bb34 ], [ %load2, %bb ]
+  %phi21 = phi i32 [ %add39, %bb34 ], [ %init_val, %bb ]
+  %phi22 = phi i32 [ %phi35, %bb34 ], [ %load12, %bb ]
+  %load24 = load atomic ptr addrspace(1), ptr @global unordered, align 8, !nonnull !1
+  %getelementptr25 = getelementptr i8, ptr addrspace(1) %load24, i64 16
+  %mul = mul i32 %phi22, 16777216
+  %add26 = add i32 %mul, -16777216
+  %ashr = ashr exact i32 %add26, 24
+  %add27 = add i32 %phi, 1
+  %icmp28 = icmp eq i32 %add27, 0
+  %load29 = load atomic i32, ptr addrspace(1) %getelementptr14 unordered, align 8, !range !2, !invariant.load !1, !noundef !1
+  %icmp30 = icmp ugt i32 %load29, 1
+  %and = and i1 %icmp30, %call
+  %load31 = load atomic i32, ptr addrspace(1) %getelementptr15 unordered, align 4, !tbaa !3, !noundef !1
+  %icmp32 = icmp eq i32 %load31, 0
+  store atomic ptr addrspace(1) null, ptr addrspace(1) %getelementptr25 unordered, align 8
+  store ptr addrspace(1) null, ptr addrspace(256) inttoptr (i64 8 to ptr addrspace(256)), align 8, !tbaa !6
+  br i1 %widenable_cond11, label %bb33, label %deopt9, !prof !8
+
+bb33:                                             ; preds = %loop_outer
+  store atomic i32 606, ptr addrspace(1) %getelementptr15 unordered, align 4, !tbaa !3
+  br label %inner_loop
+
+bb34:                                             ; preds = %bb54
+  %phi35 = phi i32 [ %ashr47, %bb54 ]
+  %phi36 = phi i32 [ %add48, %bb54 ]
+  %add37 = add i32 %phi18, 1
+  %icmp38 = icmp sgt i32 %add37, 12
+  %add39 = add i32 %phi21, 1
+  br label %loop_outer
+
+inner_loop:                                       ; preds = %bb54, %bb33
+  %phi42 = phi i32 [ %ashr, %bb33 ], [ %ashr47, %bb54 ]
+  %phi43 = phi i32 [ 1, %bb33 ], [ %add55, %bb54 ]
+  %phi44 = phi i32 [ %add27, %bb33 ], [ %add48, %bb54 ]
+  %mul45 = mul i32 %phi42, 16777216
+  %add46 = add i32 %mul45, -16777216
+  %ashr47 = ashr exact i32 %add46, 24
+  %add48 = add i32 %phi44, 1
+  %icmp49 = icmp eq i32 %add48, 0
+  br i1 %icmp49, label %bb57, label %bb54, !prof !9, !make.implicit !1
+
+deopt9:                                           ; preds = %loop_outer
+  %lcssa = phi i32 [ %init_val, %loop_outer ]
+  %phi51 = phi i32 [ %ashr, %loop_outer ]
+  %phi52 = phi i32 [ %add27, %loop_outer ]
+  %call53 = call i32 (...) @llvm.experimental.deoptimize.i32(i32 13) #2 [ "deopt"(i32 0, i32 1, i32 1291853205, i32 127, i32 3, i32 13, i32 0, i32 0, ptr addrspace(1) %load8, i32 3, i32 1, i32 3, i32 606, i32 7, ptr null, i32 7, ptr null, i32 3, i32 %phi52, i32 7, ptr null, i32 3, i32 %load2, i32 7, ptr null, i32 7, ptr null, i32 3, i32 1, i32 3, i32 0, i32 0, ptr addrspace(1) %load8, i32 3, i32 %lcssa, i32 3, i32 %phi51, i32 7, ptr null) ]
+  ret i32 %call53
+
+bb54:                                             ; preds = %inner_loop
+  store atomic i32 606, ptr addrspace(1) %getelementptr15 unordered, align 4, !tbaa !3
+  %add55 = add nuw nsw i32 %phi43, 1
+  %exitcond = icmp eq i32 %add55, 10
+  br i1 %exitcond, label %bb34, label %inner_loop, !llvm.loop !10
+
+bb57:                                             ; preds = %inner_loop
+  %phi58 = phi i32 [ %phi18, %inner_loop ]
+  %phi59 = phi i32 [ %phi21, %inner_loop ]
+  %phi60 = phi i32 [ %phi43, %inner_loop ]
+  %phi61 = phi i32 [ %ashr47, %inner_loop ]
+  %call62 = call i32 (...) @llvm.experimental.deoptimize.i32(i32 12) #2 [ "deopt"(i32 0, i32 1, i32 1291853205, i32 99, i32 2, i32 13, i32 0, i32 3, i32 0, i32 3, i32 0, i32 7, ptr null, i32 7, ptr null, i32 3, i32 0, i32 7, ptr null, i32 3, i32 %phi58, i32 7, ptr null, i32 7, ptr null, i32 3, i32 1, i32 3, i32 %phi60, i32 0, ptr addrspace(1) %load8, i32 3, i32 %phi59, i32 3, i32 %phi61, i32 7, ptr null) ]
+  ret i32 %call62
+}
+
+declare i32 @llvm.experimental.deoptimize.i32(...)
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite)
+declare noundef i1 @llvm.experimental.widenable.condition() #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(inaccessiblemem: readwrite) }
+attributes #1 = { "inline-remark"="cost=never, reason=unavailable definition" }
+attributes #2 = { "deopt-lowering"="live-in" "inline-remark"="cost=never, reason=unavailable definition" }
+
+!0 = !{!"function_entry_count", i64 32768}
+!1 = !{}
+!2 = !{i32 0, i32 2147483646}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int.array", !5}
+!5 = !{!"tbaa-access-type"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"pendingException_access", !5}
+!8 = !{!"branch_weights", i32 1048576, i32 1}
+!9 = !{!"branch_weights", i32 1, i32 983039}
+!10 = distinct !{!10, !11}
+!11 = !{!"llvm.loop.peeled.count", i32 1}