diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -254,7 +254,9 @@ if (entry != BB) { for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB), InvBlockTraits::child_end(BB))) { - if (!contains(Pred)) + // Allow predecessors that are unreachable, as these are ignored during + // region analysis. + if (!contains(Pred) && DT->isReachableFromEntry(Pred)) report_fatal_error("Broken region found: edges entering the region must " "go to the entry node!"); } diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1777,6 +1777,20 @@ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F); NewBlocks.push_back(NewBB); VMap[BB] = NewBB; + + // Unreachable predecessors will not be cloned and will not have an edge + // to the cloned block. As such, also remove them from any phi nodes. + // To avoid iterator invalidation, first collect the dead predecessors + // from the first phi node, and then perform the actual removal. + if (auto *FirstPN = dyn_cast(NewBB->begin())) { + SmallVector DeadPreds; + for (BasicBlock *Pred : FirstPN->blocks()) + if (!DT.isReachableFromEntry(Pred)) + DeadPreds.push_back(Pred); + for (PHINode &PN : make_early_inc_range(NewBB->phis())) + for (BasicBlock *Pred : DeadPreds) + PN.removeIncomingValue(Pred); + } } // Place the cloned blocks right after the original blocks (right before the diff --git a/llvm/test/Transforms/PGOProfile/chr-dead-pred.ll b/llvm/test/Transforms/PGOProfile/chr-dead-pred.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/chr-dead-pred.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -passes='require,chr' -verify-region-info < %s | FileCheck %s + +define void @test(i1 %c, i1 %c2) !prof !29 { +; CHECK-LABEL: define void @test +; CHECK-SAME: (i1 [[C:%.*]], i1 [[C2:%.*]]) !prof [[PROF29:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 true, [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = freeze i1 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 true, i1 [[TMP1]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = xor i1 true, [[C2]] +; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i1 [[TMP4]], i1 false +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF30:![0-9]+]] +; CHECK: entry.split: +; CHECK-NEXT: switch i8 0, label [[BB1:%.*]] [ +; CHECK-NEXT: i8 1, label [[BB2:%.*]] +; CHECK-NEXT: i8 2, label [[BB3:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: [[SELECT:%.*]] = select i1 false, i32 0, i32 1, !prof [[PROF31:![0-9]+]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[SELECT3:%.*]] = select i1 false, i32 0, i32 1, !prof [[PROF32:![0-9]+]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: dead: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[DEAD:%.*]] ], [ 1, [[ENTRY_SPLIT]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: entry.split.nonchr: +; CHECK-NEXT: switch i8 0, label [[BB1_NONCHR:%.*]] [ +; CHECK-NEXT: i8 1, label [[BB2_NONCHR:%.*]] +; CHECK-NEXT: i8 2, label [[BB3_NONCHR:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1.nonchr: +; CHECK-NEXT: [[SELECT_NONCHR:%.*]] = select i1 [[C]], i32 0, i32 1, !prof [[PROF31]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: bb2.nonchr: +; CHECK-NEXT: [[SELECT3_NONCHR:%.*]] = select i1 [[C2]], i32 0, i32 1, !prof [[PROF32]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: bb3.nonchr: +; CHECK-NEXT: [[PHI_NONCHR:%.*]] = phi i64 [ 1, [[ENTRY_SPLIT_NONCHR]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + switch i8 0, label %bb1 [ + i8 1, label %bb2 + i8 2, label %bb3 + ] + +bb1: ; preds = %entry + %select = select i1 %c, i32 0, i32 1, !prof !30 + br label %exit + +bb2: ; preds = %entry + %select3 = select i1 %c2, i32 0, i32 1, !prof !31 + br label %exit + +dead: ; No predecessors! + br label %bb3 + +bb3: ; preds = %dead, %entry + %phi = phi i64 [ 0, %dead ], [ 1, %entry ] + br label %exit + +exit: ; preds = %bb3, %bb2, %bb1 + ret void +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 597326977313} +!4 = !{!"MaxCount", i64 12561793713} +!5 = !{!"MaxInternalCount", i64 2509052618} +!6 = !{!"MaxFunctionCount", i64 12561793713} +!7 = !{!"NumCounts", i64 1694881} +!8 = !{!"NumFunctions", i64 129214} +!9 = !{!"IsPartialProfile", i64 0} +!10 = !{!"PartialProfileRatio", double 0.000000e+00} +!11 = !{!"DetailedSummary", !12} +!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28} +!13 = !{i32 10000, i64 12561793713, i32 1} +!14 = !{i32 100000, i64 1733566697, i32 20} +!15 = !{i32 200000, i64 820928443, i32 71} +!16 = !{i32 300000, i64 404967336, i32 182} +!17 = !{i32 400000, i64 233162193, i32 376} +!18 = !{i32 500000, i64 120552435, i32 741} +!19 = !{i32 600000, i64 69388652, i32 1402} +!20 = !{i32 700000, i64 33926336, i32 2643} +!21 = !{i32 800000, i64 15635940, i32 5288} +!22 = !{i32 900000, i64 5547105, i32 11637} +!23 = !{i32 950000, i64 2224405, i32 20074} +!24 = !{i32 990000, i64 359838, i32 44778} +!25 = !{i32 999000, i64 37485, i32 81744} +!26 = !{i32 999900, i64 3465, i32 119656} +!27 = !{i32 999990, i64 529, i32 155440} +!28 = !{i32 999999, i64 70, i32 178344} +!29 = !{!"function_entry_count", i64 33781183} +!30 = !{!"branch_weights", i32 0, i32 9263770} +!31 = !{!"branch_weights", i32 0, i32 634318} diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll deleted file mode 100644 --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll +++ /dev/null @@ -1,77 +0,0 @@ -; XFAIL: * -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s - -; FIXME: Merge into backedge-id-bug -; Variant which has an issue with region construction - -define amdgpu_kernel void @loop_backedge_misidentified_alt(ptr addrspace(1) %arg0) #0 { -entry: - %tmp = load volatile <2 x i32>, ptr addrspace(1) undef, align 16 - %load1 = load volatile <2 x float>, ptr addrspace(1) undef - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %arg0, i32 %tid - %i.initial = load volatile i32, ptr addrspace(1) %gep, align 4 - br label %LOOP.HEADER - -LOOP.HEADER: - %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ] - call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0 - %tmp12 = zext i32 %i to i64 - %tmp13 = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 %tmp12 - %tmp14 = load <4 x i32>, ptr addrspace(1) %tmp13, align 16 - %tmp15 = extractelement <4 x i32> %tmp14, i64 0 - %tmp16 = and i32 %tmp15, 65535 - %tmp17 = icmp eq i32 %tmp16, 1 - br i1 %tmp17, label %bb18, label %bb62 - -bb18: - %tmp19 = extractelement <2 x i32> %tmp, i64 0 - %tmp22 = lshr i32 %tmp19, 16 - %tmp24 = urem i32 %tmp22, 52 - %tmp25 = mul nuw nsw i32 %tmp24, 52 - br label %INNER_LOOP - -INNER_LOOP: - %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ] - call void asm sideeffect "; inner loop body", ""() #0 - %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1 - %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0 - br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP - -INNER_LOOP_BREAK: - %tmp59 = extractelement <4 x i32> %tmp14, i64 2 - call void asm sideeffect "s_nop 23 ", "~{memory}"() #0 - br label %END_ELSE_BLOCK - -bb62: - %load13 = icmp ult i32 %tmp16, 271 - ;br i1 %load13, label %bb64, label %INCREMENT_I - ; branching directly to the return avoids the bug - br i1 %load13, label %RETURN, label %INCREMENT_I - - -bb64: - call void asm sideeffect "s_nop 42", "~{memory}"() #0 - br label %RETURN - -INCREMENT_I: - %inc.i = add i32 %i, 1 - call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0 - br label %END_ELSE_BLOCK - -END_ELSE_BLOCK: - %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ] - call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0 - %cmp.end.else.block = icmp eq i32 %i.final, -1 - br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER - -RETURN: - call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0 - store volatile <2 x float> %load1, ptr addrspace(1) undef, align 8 - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #1 - -attributes #0 = { convergent nounwind } -attributes #1 = { convergent nounwind readnone } diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll @@ -156,7 +156,72 @@ ; The same function, except break to return block goes directly to the ; return, which managed to hide the bug. -; FIXME: Merge variant from backedge-id-bug-xfail +define amdgpu_kernel void @loop_backedge_misidentified_alt(ptr addrspace(1) %arg0) #0 { +entry: + %tmp = load volatile <2 x i32>, ptr addrspace(1) undef, align 16 + %load1 = load volatile <2 x float>, ptr addrspace(1) undef + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg0, i32 %tid + %i.initial = load volatile i32, ptr addrspace(1) %gep, align 4 + br label %LOOP.HEADER + +LOOP.HEADER: + %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ] + call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0 + %tmp12 = zext i32 %i to i64 + %tmp13 = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 %tmp12 + %tmp14 = load <4 x i32>, ptr addrspace(1) %tmp13, align 16 + %tmp15 = extractelement <4 x i32> %tmp14, i64 0 + %tmp16 = and i32 %tmp15, 65535 + %tmp17 = icmp eq i32 %tmp16, 1 + br i1 %tmp17, label %bb18, label %bb62 + +bb18: + %tmp19 = extractelement <2 x i32> %tmp, i64 0 + %tmp22 = lshr i32 %tmp19, 16 + %tmp24 = urem i32 %tmp22, 52 + %tmp25 = mul nuw nsw i32 %tmp24, 52 + br label %INNER_LOOP + +INNER_LOOP: + %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ] + call void asm sideeffect "; inner loop body", ""() #0 + %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1 + %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0 + br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP + +INNER_LOOP_BREAK: + %tmp59 = extractelement <4 x i32> %tmp14, i64 2 + call void asm sideeffect "s_nop 23 ", "~{memory}"() #0 + br label %END_ELSE_BLOCK + +bb62: + %load13 = icmp ult i32 %tmp16, 271 + ;br i1 %load13, label %bb64, label %INCREMENT_I + ; branching directly to the return avoids the bug + br i1 %load13, label %RETURN, label %INCREMENT_I + + +bb64: + call void asm sideeffect "s_nop 42", "~{memory}"() #0 + br label %RETURN + +INCREMENT_I: + %inc.i = add i32 %i, 1 + call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0 + br label %END_ELSE_BLOCK + +END_ELSE_BLOCK: + %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ] + call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0 + %cmp.end.else.block = icmp eq i32 %i.final, -1 + br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER + +RETURN: + call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0 + store volatile <2 x float> %load1, ptr addrspace(1) undef, align 8 + ret void +} declare i32 @llvm.amdgcn.workitem.id.x() #1