diff --git a/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h b/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h --- a/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h +++ b/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h @@ -41,6 +41,12 @@ DominatorTree &DT, const PostDominatorTree &PDT, DependenceInfo &DI); +/// Return true if all instructions (except the terminator) in \p BB can be +/// safely moved before \p InsertPoint. +bool isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint, + DominatorTree &DT, const PostDominatorTree &PDT, + DependenceInfo &DI); + /// Move instructions, in an order-preserving manner, from \p FromBB to the /// beginning of \p ToBB when proven safe. void moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB, @@ -48,6 +54,12 @@ const PostDominatorTree &PDT, DependenceInfo &DI); +/// Move instructions, in an order-preserving manner, from \p FromBB to the end +/// of \p ToBB when proven safe. +void moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB, + DominatorTree &DT, const PostDominatorTree &PDT, + DependenceInfo &DI); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_CODEMOVERUTILS_H diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -86,7 +86,9 @@ STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); STATISTIC(NonAdjacent, "Loops are not adjacent"); -STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); +STATISTIC( + NonEmptyPreheader, + "Loop has a non-empty preheader with instructions that cannot be moved"); STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); STATISTIC(NonIdenticalGuards, "Candidates have different guards"); STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); @@ -738,19 +740,21 @@ continue; } - // The following three checks look for empty blocks in FC0 and FC1. If - // any of these blocks are non-empty, we do not fuse. This is done - // because we currently do not have the safety checks to determine if - // it is safe to move the blocks past other blocks in the loop. Once - // these checks are added, these conditions can be relaxed. - if (!isEmptyPreheader(*FC1)) { - LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " - "preheader. Not fusing.\n"); + if (!isSafeToMoveBefore(*FC1->Preheader, + *FC0->Preheader->getTerminator(), DT, PDT, + DI)) { + LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " + "instructions in preheader. Not fusing.\n"); reportLoopFusion(*FC0, *FC1, NonEmptyPreheader); continue; } + // The following two checks look for empty blocks in FC0 and FC1. If + // any of these blocks are non-empty, we do not fuse. This is done + // because we currently do not have the safety checks to determine if + // it is safe to move the blocks past other blocks in the loop. Once + // these checks are added, these conditions can be relaxed. if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " "block. Not fusing.\n"); @@ -1166,6 +1170,10 @@ LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); + // Move instructions from the preheader of FC1 to the end of the preheader + // of FC0. + moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); + // Fusing guarded loops is handled slightly differently than non-guarded // loops and has been broken out into a separate method instead of trying to // intersperse the logic within a single method. diff --git a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp --- a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp +++ b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp @@ -380,6 +380,17 @@ return true; } +bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint, + DominatorTree &DT, const PostDominatorTree &PDT, + DependenceInfo &DI) { + return llvm::all_of(BB, [&](Instruction &I) { + if (BB.getTerminator() == &I) + return true; + + return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI); + }); +} + void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB, DominatorTree &DT, const PostDominatorTree &PDT, @@ -394,3 +405,15 @@ I.moveBefore(MovePos); } } + +void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB, + DominatorTree &DT, + const PostDominatorTree &PDT, + DependenceInfo &DI) { + Instruction *MovePos = ToBB.getTerminator(); + while (FromBB.size() > 1) { + Instruction &I = FromBB.front(); + if (isSafeToMoveBefore(I, *MovePos, DT, PDT, DI)) + I.moveBefore(MovePos); + } +} diff --git a/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll b/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll --- a/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll +++ b/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll @@ -104,7 +104,7 @@ ret void } -; CHECK: remark: diagnostics_missed.c:38:3: [negative_dependence]: entry and for.end: Loop has a non-empty preheader +; CHECK: remark: diagnostics_missed.c:38:3: [negative_dependence]: entry and for.end: Dependencies prevent fusion define void @negative_dependence(i32* noalias %A) !dbg !51 { entry: br label %for.body @@ -185,6 +185,36 @@ ; Function Attrs: nounwind readnone speculatable willreturn declare void @llvm.dbg.value(metadata, metadata, metadata) #0 +; CHECK: remark: diagnostics_missed.c:62:3: [unsafe_preheader]: for.first.preheader and for.second.preheader: Loop has a non-empty preheader with instructions that cannot be moved +define void @unsafe_preheader(i32* noalias %A, i32* noalias %B) { +for.first.preheader: + br label %for.first, !dbg !80 + +for.first: + %i.02 = phi i64 [ 0, %for.first.preheader ], [ %inc, %for.first ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.02 + store i32 0, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 100 + br i1 %cmp, label %for.first, label %for.second.preheader + +for.second.preheader: + call void @bar() + br label %for.second + +for.second: + %j.01 = phi i64 [ 0, %for.second.preheader ], [ %inc6, %for.second ] + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.01 + store i32 0, i32* %arrayidx4, align 4 + %inc6 = add nsw i64 %j.01, 1 + %cmp2 = icmp slt i64 %inc6, 100 + br i1 %cmp2, label %for.second, label %for.end + +for.end: + ret void +} +declare void @bar() + attributes #0 = { nounwind readnone speculatable willreturn } !llvm.dbg.cu = !{!2} @@ -267,3 +297,7 @@ !74 = !DILocation(line: 51, column: 3, scope: !70) !75 = !DILocation(line: 52, column: 15, scope: !70) !76 = !DILocation(line: 57, column: 3, scope: !63) +!77 = distinct !DISubprogram(name: "unsafe_preheader", scope: !3, file: !3, line: 60, type: !15, scopeLine: 60, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !78) +!78 = !{} +!79 = distinct !DILexicalBlock(scope: !77, file: !3, line: 3, column: 5) +!80 = !DILocation(line: 62, column: 3, scope: !79) diff --git a/llvm/test/Transforms/LoopFusion/guarded.ll b/llvm/test/Transforms/LoopFusion/guarded.ll --- a/llvm/test/Transforms/LoopFusion/guarded.ll +++ b/llvm/test/Transforms/LoopFusion/guarded.ll @@ -63,3 +63,59 @@ bb12: ; preds = %bb15, %bb14 ret void } + +; Test that `%add` is moved in for.first.preheader, and the two loops for.first +; and for.second are fused. + +; CHECK: void @moveinsts_preheader +; CHECK-LABEL: for.first.guard: +; CHECK: br i1 %cmp.guard, label %for.first.preheader, label %for.end +; CHECK-LABEL: for.first.preheader: +; CHECK-NEXT: %add = add nsw i32 %x, 1 +; CHECK-NEXT: br label %for.first +; CHECK-LABEL: for.first: +; CHECK: br i1 %cmp.j, label %for.first, label %for.second.exit +; CHECK-LABEL: for.second.exit: +; CHECK-NEXT: br label %for.end +; CHECK-LABEL: for.end: +; CHECK-NEXT: ret void +define void @moveinsts_preheader(i32* noalias %A, i32* noalias %B, i64 %N, i32 %x) { +for.first.guard: + %cmp.guard = icmp slt i64 0, %N + br i1 %cmp.guard, label %for.first.preheader, label %for.second.guard + +for.first.preheader: + br label %for.first + +for.first: + %i = phi i64 [ %inc.i, %for.first ], [ 0, %for.first.preheader ] + %Ai = getelementptr inbounds i32, i32* %A, i64 %i + store i32 0, i32* %Ai, align 4 + %inc.i = add nsw i64 %i, 1 + %cmp.i = icmp slt i64 %inc.i, %N + br i1 %cmp.i, label %for.first, label %for.first.exit + +for.first.exit: + br label %for.second.guard + +for.second.guard: + br i1 %cmp.guard, label %for.second.preheader, label %for.end + +for.second.preheader: + %add = add nsw i32 %x, 1 + br label %for.second + +for.second: + %j = phi i64 [ %inc.j, %for.second ], [ 0, %for.second.preheader ] + %Bj = getelementptr inbounds i32, i32* %B, i64 %j + store i32 0, i32* %Bj, align 4 + %inc.j = add nsw i64 %j, 1 + %cmp.j = icmp slt i64 %inc.j, %N + br i1 %cmp.j, label %for.second, label %for.second.exit + +for.second.exit: + br label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll --- a/llvm/test/Transforms/LoopFusion/simple.ll +++ b/llvm/test/Transforms/LoopFusion/simple.ll @@ -306,3 +306,88 @@ for.end: ret void } + +; Test that `%add` is moved in basic block entry, and the two loops for.first +; and for.second are fused. + +; CHECK: i32 @moveinsts_preheader +; CHECK-LABEL: entry: +; CHECK-NEXT: %add = add nsw i32 %x, 1 +; CHECK-NEXT: br label %for.first +; CHECK-LABEL: for.first: +; CHECK: br i1 %cmp.j, label %for.first, label %for.second.exit +; CHECK-LABEL: for.second.exit: +; CHECK-NEXT: ret i32 %add + +define i32 @moveinsts_preheader(i32* %A, i32 %x) { +entry: + br label %for.first + +for.first: + %i = phi i64 [ 0, %entry ], [ %inc.i, %for.first ] + %Ai = getelementptr inbounds i32, i32* %A, i64 %i + store i32 0, i32* %Ai, align 4 + %inc.i = add nsw i64 %i, 1 + %cmp.i = icmp slt i64 %inc.i, 100 + br i1 %cmp.i, label %for.first, label %for.first.exit + +for.first.exit: + %add = add nsw i32 %x, 1 + br label %for.second + +for.second: + %j = phi i64 [ 0, %for.first.exit ], [ %inc.j, %for.second ] + %Aj = getelementptr inbounds i32, i32* %A, i64 %j + store i32 2, i32* %Aj, align 4 + %inc.j = add nsw i64 %j, 1 + %cmp.j = icmp slt i64 %inc.j, 100 + br i1 %cmp.j, label %for.second, label %for.second.exit + +for.second.exit: + ret i32 %add +} + +; Test that `%add` cannot be moved to basic block entry, as it uses %i, which +; defined after basic block entry. And the two loops for.first and for.second +; are not fused. + +; CHECK: i64 @unsafe_preheader +; CHECK-LABEL: entry: +; CHECK-NEXT: br label %for.first +; CHECK-LABEL: for.first: +; CHECK: br i1 %cmp.i, label %for.first, label %for.first.exit +; CHECK-LABEL: for.first.exit: +; CHECK-NEXT: %add = add nsw i64 %x, %i +; CHECK-NEXT: br label %for.second +; CHECK-LABEL: for.second: +; CHECK: br i1 %cmp.j, label %for.second, label %for.second.exit +; CHECK-LABEL: for.second.exit: +; CHECK-NEXT: ret i64 %add + +define i64 @unsafe_preheader(i32* %A, i64 %x) { +entry: + br label %for.first + +for.first: + %i = phi i64 [ 0, %entry ], [ %inc.i, %for.first ] + %Ai = getelementptr inbounds i32, i32* %A, i64 %i + store i32 0, i32* %Ai, align 4 + %inc.i = add nsw i64 %i, 1 + %cmp.i = icmp slt i64 %inc.i, 100 + br i1 %cmp.i, label %for.first, label %for.first.exit + +for.first.exit: + %add = add nsw i64 %x, %i + br label %for.second + +for.second: + %j = phi i64 [ 0, %for.first.exit ], [ %inc.j, %for.second ] + %Aj = getelementptr inbounds i32, i32* %A, i64 %j + store i32 2, i32* %Aj, align 4 + %inc.j = add nsw i64 %j, 1 + %cmp.j = icmp slt i64 %inc.j, 100 + br i1 %cmp.j, label %for.second, label %for.second.exit + +for.second.exit: + ret i64 %add +}