Index: include/llvm/Analysis/LoopInfoImpl.h =================================================================== --- include/llvm/Analysis/LoopInfoImpl.h +++ include/llvm/Analysis/LoopInfoImpl.h @@ -186,8 +186,13 @@ template void LoopBase:: addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase &LIB) { - assert((Blocks.empty() || LIB[getHeader()] == this) && - "Incorrect LI specified for this loop!"); +#ifndef NDEBUG + if (!Blocks.empty()) { + auto SameHeader = LIB[getHeader()]; + assert(contains(SameHeader) && getHeader() == SameHeader->getHeader() + && "Incorrect LI specified for this loop!"); + } +#endif assert(NewBB && "Cannot add a null basic block to the loop!"); assert(!LIB[NewBB] && "BasicBlock already in the loop!"); Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -478,11 +478,11 @@ --SuccChain.UnscheduledPredecessors > 0) continue; - auto *MBB = *SuccChain.begin(); - if (MBB->isEHPad()) - EHPadWorkList.push_back(MBB); + auto *NewBB = *SuccChain.begin(); + if (NewBB->isEHPad()) + EHPadWorkList.push_back(NewBB); else - BlockWorkList.push_back(MBB); + BlockWorkList.push_back(NewBB); } } @@ -1814,16 +1814,13 @@ BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt) { bool Removed, DuplicatedToLPred; + bool DuplicatedToOriginalLPred; Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt, DuplicatedToLPred); if (!Removed) return false; - // If BB was duplicated into LPred, it is now scheduled. But because it was - // removed, markChainSuccessors won't be called for its chain. Instead we call - // markBlockSuccessors for LPred to achieve the same effect. - if (DuplicatedToLPred) - markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter); + DuplicatedToOriginalLPred = DuplicatedToLPred; // Iteratively try to duplicate again. It can happen that a block that is // duplicated into is still small enough to be duplicated again. // No need to call markBlockSuccessors in this case, as the blocks being @@ -1848,7 +1845,14 @@ PrevUnplacedBlockIt, DuplicatedToLPred); } + // If BB was duplicated into LPred, it is now scheduled. But because it was + // removed, markChainSuccessors won't be called for its chain. Instead we + // call markBlockSuccessors for LPred to achieve the same effect. This must go + // at the end because repeating the tail duplication can increase the number + // of unscheduled predecessors. LPred = *std::prev(Chain.end()); + if (DuplicatedToOriginalLPred) + markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter); return true; } Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -868,6 +868,10 @@ !TailBB->hasAddressTaken()) { DEBUG(dbgs() << "\nMerging into block: " << *PrevBB << "From MBB: " << *TailBB); + // There may be a branch to the layout successor. This is unlikely but it + // happens. The correct thing to do is to remove the branch before + // duplicating the instructions in all cases. + TII->removeBranch(*PrevBB); if (PreRegAlloc) { DenseMap LocalVRMap; SmallVector, 4> CopyInfos; Index: test/CodeGen/AArch64/tail-dup-repeat-worklist.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/tail-dup-repeat-worklist.ll @@ -0,0 +1,69 @@ +; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +%struct.s1 = type { %struct.s3*, %struct.s1* } +%struct.s2 = type opaque +%struct.s3 = type { i32 } + +; Function Attrs: nounwind +define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 { +entry: + br label %while.cond.outer + +; The loop gets laid out: +; %while.cond.outer +; %(null) +; %(null) +; %dup2 +; and then %dup1 gets chosen as the next block. +; when dup2 is duplicated into dup1, %worklist could erroneously be placed on +; the worklist, because all of its current predecessors are now scheduled. +; However, after dup2 is tail-duplicated, %worklist can't be on the worklist +; because it now has unscheduled predecessors.q +; CHECK-LABEL: repeated_dup_worklist +; CHECK: // %entry +; CHECK: // %while.cond.outer +; first %(null) block +; CHECK: // in Loop: +; CHECK: ldr +; CHECK-NEXT: tbnz +; second %(null) block +; CHECK: // in Loop: +; CHECK: // %dup2 +; CHECK: // %worklist +; CHECK: // %if.then96.i +while.cond.outer: ; preds = %dup1, %entry + %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ] + %inc77 = add nsw i32 %progress.0.ph, 1 + %cmp = icmp slt i32 %progress.0.ph, %i32_1 + br i1 %cmp, label %dup2, label %dup1 + +dup2: ; preds = %if.then96.i, %worklist, %while.cond.outer + %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ] + %.pr = load %struct.s1*, %struct.s1** %pp1, align 8 + br label %dup1 + +dup1: ; preds = %dup2, %while.cond.outer + %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ] + %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ] + br i1 %i1_1, label %while.cond.outer, label %worklist + +worklist: ; preds = %dup1 + %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0 + %1 = load %struct.s3*, %struct.s3** %snode94, align 8 + %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0 + %3 = load i32, i32* %2, align 4 + %tobool95.i = icmp eq i32 %3, 0 + br i1 %tobool95.i, label %if.then96.i, label %dup2 + +if.then96.i: ; preds = %worklist + call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1 + br label %dup2 +} + +; Function Attrs: nounwind +declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } Index: test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll @@ -0,0 +1,65 @@ +; RUN: llc -O2 %s -o - | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +declare void @f1() +declare void @f2() +declare void @f3() +declare void @f4() + +; Function Attrs: nounwind +; CHECK-LABEL: tail_dup_fallthrough_with_branch +; CHECK: # %entry +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %entry +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %sw.0 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %sw.1 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %sw.default +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %if.then +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %if.else +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: .Lfunc_end0 +define fastcc void @tail_dup_fallthrough_with_branch(i32 %a, i1 %b) unnamed_addr #0 { +entry: + switch i32 %a, label %sw.default [ + i32 0, label %sw.0 + i32 1, label %sw.1 + ] + +sw.0: ; preds = %entry + call void @f1() #0 + br label %dup1 + +sw.1: ; preds = %entry + call void @f2() #0 + br label %dup1 + +sw.default: ; preds = %entry + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %sw.default + call void @f3() #0 + br label %dup2 + +if.else: ; preds = %sw.default + call void @f4() #0 + br label %dup2 + +dup1: ; preds = %sw.0, %sw.1 + call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0 + unreachable + +dup2: ; preds = %if.then, %if.else + call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0 + unreachable +} + +attributes #0 = { nounwind } Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -0,0 +1,190 @@ +; RUN: llc -O2 -o - %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +; CHECK-LABEL: tail_dup_merge_loops +; CHECK: # %entry +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %inner_loop_exit +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %inner_loop_latch +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %inner_loop_test +; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %exit +define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 { +entry: + %notlhs674.i = icmp eq i32 %a, 0 + br label %outer_loop_top + +outer_loop_top: ; preds = %inner_loop_exit, %entry + %dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ] + br i1 %notlhs674.i, label %exit, label %inner_loop_preheader + +inner_loop_preheader: ; preds = %outer_loop_top + br label %inner_loop_top + +inner_loop_top: ; preds = %inner_loop_latch, %inner_loop_preheader + %dst.0.i = phi i8* [ %inc, %inner_loop_latch ], [ %dst.0.ph.i, %inner_loop_preheader ] + %var = load i8, i8* %dst.0.i + %tobool1.i = icmp slt i8 %var, 0 + br label %inner_loop_test + +inner_loop_test: ; preds = %inner_loop_top + br i1 %tobool1.i, label %inner_loop_exit, label %inner_loop_latch + +inner_loop_exit: ; preds = %inner_loop_test + %scevgep.i = getelementptr i8, i8* %dst.0.i, i64 1 + %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 0 + br label %outer_loop_top + +inner_loop_latch: ; preds = %inner_loop_test + %cmp75.i = icmp ult i8* %dst.0.i, %c + %inc = getelementptr i8, i8* %dst.0.i, i64 2 + br label %inner_loop_top + +exit: ; preds = %outer_loop_top + ret void +} + +@.str.6 = external unnamed_addr constant [23 x i8], align 1 + +; There is an erroneus check in LoopBase::addBasicBlockToLoop(), where it +; assumes that the header block for a loop is unique. +; For most of compilation this assumption is true, but during layout we allow +; this assumption to be violated. The following code will trigger the bug: + +; The loops in question is eventually headed by the block shared_loop_header +; +; During layout The block labeled outer_loop_header gets tail-duplicated into +; outer_loop_latch, and into shared_preheader, and then removed. This leaves +; shared_loop_header as the header of both loops. The end result +; is that there are 2 valid loops, and that they share a header. If we re-ran +; the loop analysis, it would classify this as a single loop. +; So far this is fine as far as layout is concerned. +; After layout we tail merge blocks merge_other and merge_predecessor_split. +; We do this even though they share only a single instruction, because +; merge_predecessor_split falls through to their shared successor: +; outer_loop_latch. +; The rest of the blocks in the function are noise unfortunately. Bugpoint +; couldn't shrink the test any further. + +; CHECK-LABEL: loop_shared_header +; CHECK: # %entry +; CHECK: # %shared_preheader +; CHECK: # %shared_loop_header +; CHECK: # %inner_loop_body +; CHECK: # %merge_predecessor_split +; CHECK: # %outer_loop_latch +; CHECK: # %outer_loop_latch +; CHECK: # %cleanup +define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 { +entry: + %0 = load i32, i32* undef, align 4 + %mul = shl nsw i32 %0, 2 + br i1 undef, label %if.end19, label %cleanup + +if.end19: ; preds = %entry + %conv = zext i32 %mul to i64 + %call = tail call i8* @cli_calloc(i64 %conv, i64 1) + %1 = icmp eq i32 %exesz, 0 + %notrhs = icmp eq i32 %0, 0 + %or.cond117.not = or i1 %1, %notrhs + %or.cond202 = or i1 %or.cond117.not, undef + %cmp35 = icmp ult i8* undef, %exe + %or.cond203 = or i1 %or.cond202, %cmp35 + br i1 %or.cond203, label %cleanup, label %if.end50 + +if.end50: ; preds = %if.end19 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %call, i8* undef, i64 %conv, i32 1, i1 false) + %cmp1.i.i = icmp ugt i32 %mul, 3 + br i1 %cmp1.i.i, label %shared_preheader, label %wunpsect.exit.thread.loopexit391 + +shared_preheader: ; preds = %if.end50 + br label %outer_loop_header + +outer_loop_header: ; preds = %outer_loop_latch, %shared_preheader + %bits.1.i = phi i8 [ 32, %shared_preheader ], [ %bits.43.i, %outer_loop_latch ] + %dst.0.ph.i = phi i8* [ undef, %shared_preheader ], [ %scevgep679.i, %outer_loop_latch ] + %2 = icmp eq i32 undef, 0 + br i1 %2, label %while.cond.us1412.i, label %shared_loop_header + +while.cond.us1412.i: ; preds = %outer_loop_header + %.pre.i = add i8 %bits.1.i, -1 + %tobool2.us1420.i = icmp eq i8 %.pre.i, 0 + %or.cond.us1421.i = or i1 undef, %tobool2.us1420.i + br i1 %or.cond.us1421.i, label %if.end41.us1436.i, label %cleanup + +if.end41.us1436.i: ; preds = %while.cond.us1412.i + unreachable + +shared_loop_header: ; preds = %dup_early2, %dup_early1 + %dst.0.i = phi i8* [ undef, %inner_loop_body ], [ %dst.0.ph.i, %outer_loop_header ], [ undef, %dead_block ] + %cmp3.i1172.i = icmp ult i8* undef, %call + br i1 %cmp3.i1172.i, label %wunpsect.exit.thread.loopexit389, label %inner_loop_body + +inner_loop_body: ; preds = %shared_loop_header + %3 = icmp slt i32 undef, 0 + br i1 %3, label %if.end96.i, label %shared_loop_header + +dead_block: ; preds = %inner_loop_body + %cmp75.i = icmp ult i8* %dst.0.i, undef + br label %shared_loop_header + +if.end96.i: ; preds = %inner_loop_body + %cmp97.i = icmp ugt i32 undef, 2 + br i1 %cmp97.i, label %if.then99.i, label %if.end287.i + +if.then99.i: ; preds = %if.end96.i + tail call void (i8*, ...) @cli_dbgmsg(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.6, i64 0, i64 0), i32 undef) + br label %cleanup + +if.end287.i: ; preds = %if.end96.i + %cmp291.i = icmp ne i32 undef, 1 + %conv294.i = select i1 %cmp291.i, i16 4, i16 3 + br i1 undef, label %if.end308.i, label %outer_loop_latch + +if.end308.i: ; preds = %if.end287.i + br i1 undef, label %if.end335.i, label %merge_predecessor_split + +merge_predecessor_split: ; preds = %if.end308.i + %4 = bitcast i8* undef to i32* + br label %outer_loop_latch + +if.end335.i: ; preds = %if.end308.i + br i1 undef, label %outer_loop_latch, label %merge_other + +merge_other: ; preds = %if.end335.i + br label %outer_loop_latch + +outer_loop_latch: ; preds = %merge_other, %if.end335.i, %merge_predecessor_split, %if.end287.i + %bits.43.i = phi i8 [ undef, %if.end287.i ], [ undef, %merge_other ], [ 32, %merge_predecessor_split ], [ 0, %if.end335.i ] + %backsize.0.i = phi i16 [ %conv294.i, %if.end287.i ], [ 0, %merge_other ], [ 0, %merge_predecessor_split ], [ 0, %if.end335.i ] + %5 = add i16 %backsize.0.i, -1 + %6 = zext i16 %5 to i64 + %scevgep.i = getelementptr i8, i8* %dst.0.ph.i, i64 1 + %scevgep679.i = getelementptr i8, i8* %scevgep.i, i64 %6 + br label %outer_loop_header + +wunpsect.exit.thread.loopexit389: ; preds = %shared_loop_header + unreachable + +wunpsect.exit.thread.loopexit391: ; preds = %if.end50 + unreachable + +cleanup: ; preds = %if.then99.i, %while.cond.us1412.i, %if.end19, %entry + %retval.0 = phi i32 [ 0, %if.then99.i ], [ 1, %entry ], [ 1, %if.end19 ], [ 1, %while.cond.us1412.i ] + ret i32 %retval.0 +} + +; Function Attrs: nounwind +declare void @cli_dbgmsg(i8*, ...) local_unnamed_addr #0 + +; Function Attrs: nounwind +declare i8* @cli_calloc(i64, i64) local_unnamed_addr #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind }