diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -17,7 +17,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" @@ -85,11 +84,7 @@ std::unique_ptr<BranchProbabilityInfo> BPI; bool HasProfileData = false; bool HasGuards = false; -#ifdef NDEBUG SmallPtrSet<BasicBlock *, 16> LoopHeaders; -#else - SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; -#endif unsigned BBDupThreshold; unsigned DefaultBBDupThreshold; diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -157,8 +157,12 @@ /// other than PHI nodes, potential debug intrinsics and the branch. If /// possible, eliminate BB by rewriting all the predecessors to branch to the /// successor block and return true. If we can't transform, return false. -bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, - DomTreeUpdater *DTU = nullptr); +/// +/// The set of loop headers is needed to avoid that loop metadata is associated +/// with the wrong loop. +bool TryToSimplifyUncondBranchFromEmptyBlock( + BasicBlock *BB, const SmallPtrSetImpl<BasicBlock *> &LoopHeaders, + DomTreeUpdater *DTU = nullptr); /// Check for and eliminate duplicate PHI nodes in this block. 
This doesn't try /// to be clever about PHI nodes which differ only in the order of the incoming diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -468,7 +468,7 @@ // Don't alter Loop headers and latches to ensure another pass can // detect and transform nested loops later. !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) && - TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) { + TryToSimplifyUncondBranchFromEmptyBlock(&BB, LoopHeaders, DTU)) { RemoveRedundantDbgInstrs(Succ, true); // BB is valid for cleanup here because we passed in DTU. F remains // BB's parent until a DTU->getDomTree() event. diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1017,8 +1017,9 @@ replaceUndefValuesInPhi(PN, IncomingValues); } -bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, - DomTreeUpdater *DTU) { +bool llvm::TryToSimplifyUncondBranchFromEmptyBlock( + BasicBlock *BB, const SmallPtrSetImpl<BasicBlock *> &LoopHeaders, + DomTreeUpdater *DTU) { assert(BB != &BB->getParent()->getEntryBlock() && "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!"); @@ -1057,6 +1058,20 @@ } } + // Return whether the BB is a latch of some loop. If yes, its llvm.loop + // metadata has to be preserved. + auto IsLoopLatch = [&LoopHeaders](BasicBlock *BB) { + for (BasicBlock *Succ : successors(BB)) + if (is_contained(LoopHeaders, Succ)) + return true; + return false; + }; + + unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop"); + Instruction *TI = BB->getTerminator(); + MDNode *LoopMD = TI ? TI->getMetadata(LoopMDKind) : nullptr; + bool BBIsLatch = IsLoopLatch(BB); + // We cannot fold the block if it's a branch to an already present callbr // successor because that creates duplicate successors. 
for (BasicBlock *PredBB : predecessors(BB)) { @@ -1067,6 +1082,13 @@ if (Succ == CBI->getIndirectDest(i)) return false; } + + // If both BBs are latches, then do not merge them to keep their metadata + // distinct. + Instruction *PredTI = PredBB->getTerminator(); + MDNode *PredMD = PredTI ? PredTI->getMetadata(LoopMDKind) : nullptr; + if (LoopMD != PredMD && BBIsLatch && IsLoopLatch(PredBB)) + return false; } LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB); @@ -1116,14 +1138,14 @@ } } - // If the unconditional branch we replaced contains llvm.loop metadata, we - // add the metadata to the branch instructions in the predecessors. - unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop"); - Instruction *TI = BB->getTerminator(); - if (TI) - if (MDNode *LoopMD = TI->getMetadata(LoopMDKind)) - for (BasicBlock *Pred : predecessors(BB)) - Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD); + // We previously ensured that either BB is not a loop latch, or none of its + // predecessor is (or they have the same metadata already). If BB is a latch, + // then its predecessors will become latches of the same loop; therefore + // transfer the loop's metadata to the new latches. If a predecessor was a + // latch, then nothing changes. + if (BBIsLatch) + for (BasicBlock *Pred : predecessors(BB)) + Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD); // For AutoFDO, since BB is going to be removed, we won't be able to sample // it. To avoid assigning a zero weight for BB, move all its pseudo probes diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6347,6 +6347,11 @@ BasicBlock *BB = BI->getParent(); BasicBlock *Succ = BI->getSuccessor(0); + SmallPtrSet<BasicBlock *, 4> LoopHeaderSet; + for (WeakVH L : LoopHeaders) + if (L) + LoopHeaderSet.insert(cast<BasicBlock>(L)); + // If the Terminator is the only non-phi instruction, simplify the block. 
// If LoopHeader is provided, check if the block or its successor is a loop // header. (This is for early invocations before loop simplify and @@ -6360,7 +6365,8 @@ (is_contained(LoopHeaders, BB) || is_contained(LoopHeaders, Succ))); BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(true)->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && - !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB, DTU)) + !NeedCanonicalLoop && + TryToSimplifyUncondBranchFromEmptyBlock(BB, LoopHeaderSet, DTU)) return true; // If the only instruction in the block is a seteq/setne comparison against a diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -153,9 +153,11 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1016 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP6]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; ENABLED_MASKED_STRIDED: scalar.ph: +; ENABLED_MASKED_STRIDED-NEXT: br label [[FOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: for.body: -; ENABLED_MASKED_STRIDED-NEXT: [[IX_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1016, [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[IX_09:%.*]] = phi i32 [ 1016, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_09]], [[CONV]] ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; 
ENABLED_MASKED_STRIDED: if.then: diff --git a/llvm/test/Transforms/SimplifyCFG/pr50060-jumpthread.ll b/llvm/test/Transforms/SimplifyCFG/pr50060-jumpthread.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr50060-jumpthread.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -simplifycfg < %s | FileCheck %s +; +; When simplifying the unconditional branch do.body.loopexit +; (latch of%do.body), ensure that the loop metadata is transferred to +; the new latch (for.cond.cleanup3). +; +; llvm.org/PR50060 + +@n = dso_local global i32 0, align 4 +@C = dso_local global i32 0, align 4 + +; Function Attrs: nounwind +define dso_local void @_Z6test01v() addrspace(1) #0 { +; CHECK-LABEL: @_Z6test01v( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4, !tbaa [[TBAA2:![0-9]+]] +; CHECK-NEXT: [[CMP26:%.*]] = icmp slt i32 0, [[TMP0]] +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body.loopexit: +; CHECK-NEXT: br label [[DO_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: do.body: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[J_08:%.*]] = phi i32 [ 0, [[DO_BODY]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY4:%.*]], label [[FOR_COND_CLEANUP3]] +; CHECK: for.cond.cleanup3: +; CHECK-NEXT: [[INC7]] = add nuw nsw i32 [[J_08]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC7]], 3 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[DO_BODY_LOOPEXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body4: +; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: store volatile i32 [[I_07]], i32* @C, align 4, !tbaa [[TBAA2]] +; CHECK-NEXT: [[INC5]] = add nuw nsw i32 [[I_07]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC5]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY4]], label 
[[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP11:![0-9]+]] +; +entry: + %0 = load i32, i32* @n, align 4, !tbaa !2 + %cmp26 = icmp slt i32 0, %0 + br label %do.body + +do.body.loopexit: ; preds = %for.cond.cleanup3 + br label %do.body, !llvm.loop !6 + +do.body: ; preds = %do.body.loopexit, %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %do.body, %for.cond.cleanup3 + %j.08 = phi i32 [ 0, %do.body ], [ %inc7, %for.cond.cleanup3 ] + br i1 %cmp26, label %for.body4.lr.ph, label %for.cond.cleanup3 + +for.body4.lr.ph: ; preds = %for.cond1.preheader + br label %for.body4 + +for.cond1.for.cond.cleanup3_crit_edge: ; preds = %for.body4 + br label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.cond1.for.cond.cleanup3_crit_edge, %for.cond1.preheader + %inc7 = add nuw nsw i32 %j.08, 1 + %cmp = icmp ult i32 %inc7, 3 + br i1 %cmp, label %for.cond1.preheader, label %do.body.loopexit, !llvm.loop !8 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %i.07 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc5, %for.body4 ] + store volatile i32 %i.07, i32* @C, align 4, !tbaa !2 + %inc5 = add nuw nsw i32 %i.07, 1 + %cmp2 = icmp slt i32 %inc5, %0 + br i1 %cmp2, label %for.body4, label %for.cond1.for.cond.cleanup3_crit_edge, !llvm.loop !11 +} + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) addrspace(1) #1 + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) addrspace(1) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nofree nosync nounwind willreturn } + 
+!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang)"} +!2 = !{!3, !3, i64 0, i64 4} +!3 = !{!4, i64 4, !"int"} +!4 = !{!5, i64 1, !"omnipotent char"} +!5 = !{!"Simple C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.unroll.count", i32 2} +!8 = distinct !{!8, !9, !10} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.unroll.disable"} +!11 = distinct !{!11, !9}