diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -303,7 +303,7 @@ /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix); + CharMatrix &DepMatrix, DominatorTree *DT); /// Discover induction PHIs in the header of \p L. Induction /// PHIs are added to \p Inductions. @@ -524,7 +524,7 @@ LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); - if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { + if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix, DT)) { LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); return false; } @@ -956,7 +956,8 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix) { + CharMatrix &DepMatrix, + DominatorTree *DT) { if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId @@ -971,14 +972,14 @@ } // Check if outer and inner loop contain legal instructions only. for (auto *BB : OuterLoop->blocks()) - for (Instruction &I : BB->instructionsWithoutDebug()) + for (Instruction &I : BB->instructionsWithoutDebug()) { if (CallInst *CI = dyn_cast(&I)) { // readnone functions do not prevent interchanging. if (CI->onlyWritesMemory()) continue; LLVM_DEBUG( dbgs() << "Loops with call instructions cannot be interchanged " - << "safely."); + << "safely.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst", CI->getDebugLoc(), @@ -989,6 +990,40 @@ return false; } + // Check if there is control-flow divergence in the inner loop with + // loop-variant conditions, if that is the case, and if there are + // memory accesses in one of the diverged branches, do not interchange. + else if (BranchInst *BI = dyn_cast(&I)) { + if (!InnerLoop->contains(BI) || !BI->isConditional()) + continue; + BasicBlock *Cur = BI->getParent(); + BasicBlock *InnerLatch = InnerLoop->getLoopLatch(); + // Loop latch branches are fine. + if (Cur == InnerLatch) + continue; + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + if ((!DT->dominates(Succ0, InnerLatch) && + containsUnsafeInstructions(Succ0)) || + (!DT->dominates(Succ1, InnerLatch) && + containsUnsafeInstructions(Succ1))) { + Value *Cond = BI->getCondition(); + if (SE->isLoopInvariant(SE->getSCEV(Cond), OuterLoop)) + continue; + LLVM_DEBUG(dbgs() << "Loops with unsafe control-flow divergence " + "cannot be interchanged " + << "safely.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "BranchInst", + BI->getDebugLoc(), BI->getParent()) + << "Cannot interchange loops due to branch instruction."; + }); + + return false; + } + } + } + if (!findInductions(InnerLoop, InnerLoopInductions)) { LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n"); return false; diff --git a/llvm/test/Transforms/LoopInterchange/pr48057.ll b/llvm/test/Transforms/LoopInterchange/pr48057.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/pr48057.ll @@ -0,0 +1,301 @@ +; RUN: opt < %s -basic-aa -loop-interchange -pass-remarks='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s +; RUN: FileCheck --input-file=%t %s + + +; Check whether there is unsafe control-flow divergence in the inner loop. That is, +; if a control flow is based on a loop-variant condition and if there are memory acesses +; inside the control flow divergence. Do not interchange if that is the case. + +; Both outer and inner indvars are involved in the +; control-flow divergence condition. +; +; char b[][8] = {{}, {}, {}, {}, {5}, {}, 2, 3}; +; int c, d; +; short e; +; static char test1() { +; for (; c <= 7; c++) { +; d = 4; +; for (; d; d--) +; b[d + 2][c] && (e = b[d][0]); +; } +; } + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: BranchInst +; CHECK-NEXT: Function: test1 + +; IR-LABEL: @test1( +; IR-NOT: split + +@b = dso_local local_unnamed_addr global [7 x [8 x i8]] [[8 x i8] zeroinitializer, [8 x i8] zeroinitializer, [8 x i8] zeroinitializer, [8 x i8] zeroinitializer, [8 x i8] c"\05\00\00\00\00\00\00\00", [8 x i8] zeroinitializer, [8 x i8] c"\02\03\00\00\00\00\00\00"], align 1 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@e = common dso_local local_unnamed_addr global i16 0, align 2 +@c = common dso_local local_unnamed_addr global i32 0, align 4 +@d = common dso_local local_unnamed_addr global i32 0, align 4 + +; Function Attrs: nofree nounwind optsize +define dso_local i32 @test1() local_unnamed_addr #0 { +entry: + %.pr.i = load i32, i32* @c, align 4 + %cmp2.i = icmp slt i32 %.pr.i, 8 + br i1 %cmp2.i, label %for.cond1.preheader.lr.ph.i, label %f.exit + +for.cond1.preheader.lr.ph.i: ; preds = %entry + %0 = sext i32 %.pr.i to i64 + br label %for.cond1.preheader.i + +for.cond1.preheader.i: ; preds = %for.inc12.i, %for.cond1.preheader.lr.ph.i + %indvars.iv4.i = phi i64 [ %0, %for.cond1.preheader.lr.ph.i ], [ %indvars.iv.next5.i, %for.inc12.i ] + br label %for.body2.i + +for.body2.i: ; preds = %land.end.i, %for.cond1.preheader.i + %indvars.iv.i = phi i64 [ 4, %for.cond1.preheader.i ], [ %indvars.iv.next.i, %land.end.i ] + %1 = add nuw nsw i64 %indvars.iv.i, 2 + %arrayidx4.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %1, i64 %indvars.iv4.i + %2 = load i8, i8* %arrayidx4.i, align 1 + %tobool5.i = icmp eq i8 %2, 0 + br i1 %tobool5.i, label %land.end.i, label %land.rhs.i + +land.rhs.i: ; preds = %for.body2.i + %arrayidx8.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %indvars.iv.i, i64 0 + %3 = load i8, i8* %arrayidx8.i, align 1 + %conv9.i = zext i8 %3 to i16 + store i16 %conv9.i, i16* @e, align 2 + br label %land.end.i + +land.end.i: ; preds = %land.rhs.i, %for.body2.i + %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1 + %tobool.i = icmp eq i64 %indvars.iv.next.i, 0 + br i1 %tobool.i, label %for.inc12.i, label %for.body2.i + +for.inc12.i: ; preds = %land.end.i + %indvars.iv.next5.i = add nsw i64 %indvars.iv4.i, 1 + %cmp.i = icmp slt i64 %indvars.iv4.i, 7 + br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.for.end13_crit_edge.i + +for.cond.for.end13_crit_edge.i: ; preds = %for.inc12.i + %indvars.iv.next5.i.lcssa = phi i64 [ %indvars.iv.next5.i, %for.inc12.i ] + %4 = trunc i64 %indvars.iv.next5.i.lcssa to i32 + store i32 0, i32* @d, align 4 + store i32 %4, i32* @c, align 4 + br label %f.exit + +f.exit: ; preds = %for.cond.for.end13_crit_edge.i, %entry + %5 = load i16, i16* @e, align 2 + %conv = sext i16 %5 to i32 + ret i32 0 +} + +; Only the inner indvar is involved in the +; control-flow divergence condition. +; +; char b[][8] = {{}, {}, {}, {}, {5}, {}, 2, 3}; +; int c, d; +; short e; +; static char test2() { +; for (; c <= 7; c++) { +; d = 4; +; for (; d; d--) +; b[d + 2][0] && (e = b[d][0]); +; } +; } + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: BranchInst +; CHECK-NEXT: Function: test2 + +; IR-LABEL: @test2( +; IR-NOT: split + +; Function Attrs: nofree nounwind optsize +define dso_local i32 @test2() local_unnamed_addr #0 { +entry: + %.pr.i = load i32, i32* @c, align 4 + %cmp2.i = icmp slt i32 %.pr.i, 8 + br i1 %cmp2.i, label %for.cond1.preheader.lr.ph.i, label %f.exit + +for.cond1.preheader.lr.ph.i: ; preds = %entry + %0 = sext i32 %.pr.i to i64 + br label %for.cond1.preheader.i + +for.cond1.preheader.i: ; preds = %for.inc12.i, %for.cond1.preheader.lr.ph.i + %indvars.iv4.i = phi i64 [ %0, %for.cond1.preheader.lr.ph.i ], [ %indvars.iv.next5.i, %for.inc12.i ] + br label %for.body2.i + +for.body2.i: ; preds = %land.end.i, %for.cond1.preheader.i + %indvars.iv.i = phi i64 [ 4, %for.cond1.preheader.i ], [ %indvars.iv.next.i, %land.end.i ] + %1 = add nuw nsw i64 %indvars.iv.i, 2 + %arrayidx4.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %1, i64 0 + %2 = load i8, i8* %arrayidx4.i, align 1 + %tobool5.i = icmp eq i8 %2, 0 + br i1 %tobool5.i, label %land.end.i, label %land.rhs.i + +land.rhs.i: ; preds = %for.body2.i + %arrayidx8.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %indvars.iv.i, i64 0 + %3 = load i8, i8* %arrayidx8.i, align 1 + %conv9.i = zext i8 %3 to i16 + store i16 %conv9.i, i16* @e, align 2 + br label %land.end.i + +land.end.i: ; preds = %land.rhs.i, %for.body2.i + %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1 + %tobool.i = icmp eq i64 %indvars.iv.next.i, 0 + br i1 %tobool.i, label %for.inc12.i, label %for.body2.i + +for.inc12.i: ; preds = %land.end.i + %indvars.iv.next5.i = add nsw i64 %indvars.iv4.i, 1 + %cmp.i = icmp slt i64 %indvars.iv4.i, 7 + br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.for.end13_crit_edge.i + +for.cond.for.end13_crit_edge.i: ; preds = %for.inc12.i + %indvars.iv.next5.i.lcssa = phi i64 [ %indvars.iv.next5.i, %for.inc12.i ] + %4 = trunc i64 %indvars.iv.next5.i.lcssa to i32 + store i32 0, i32* @d, align 4 + store i32 %4, i32* @c, align 4 + br label %f.exit + +f.exit: ; preds = %for.cond.for.end13_crit_edge.i, %entry + %5 = load i16, i16* @e, align 2 + %conv = sext i16 %5 to i32 + ret i32 0 +} + +; Only the outer indvar is involved in the +; control-flow divergence condition. +; +; char b[][8] = {{}, {}, {}, {}, {5}, {}, 2, 3}; +; int c, d; +; short e; +; static char test3() { +; for (; c <= 7; c++) { +; d = 4; +; for (; d; d--) +; b[0][c] && (e = b[d][0]); +; } +; } + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: BranchInst +; CHECK-NEXT: Function: test3 + +; IR-LABEL: @test3( +; IR-NOT: split + +; Function Attrs: nofree nounwind optsize +define dso_local i32 @test3() local_unnamed_addr #0 { +entry: + %.pr.i = load i32, i32* @c, align 4 + %cmp2.i = icmp slt i32 %.pr.i, 8 + br i1 %cmp2.i, label %for.cond1.preheader.lr.ph.i, label %f.exit + +for.cond1.preheader.lr.ph.i: ; preds = %entry + %0 = sext i32 %.pr.i to i64 + br label %for.cond1.preheader.i + +for.cond1.preheader.i: ; preds = %for.inc12.i, %for.cond1.preheader.lr.ph.i + %indvars.iv4.i = phi i64 [ %0, %for.cond1.preheader.lr.ph.i ], [ %indvars.iv.next5.i, %for.inc12.i ] + br label %for.body2.i + +for.body2.i: ; preds = %land.end.i, %for.cond1.preheader.i + %indvars.iv.i = phi i64 [ 4, %for.cond1.preheader.i ], [ %indvars.iv.next.i, %land.end.i ] + %1 = add nuw nsw i64 %indvars.iv.i, 2 + %arrayidx4.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 0, i64 %indvars.iv4.i + %2 = load i8, i8* %arrayidx4.i, align 1 + %tobool5.i = icmp eq i8 %2, 0 + br i1 %tobool5.i, label %land.end.i, label %land.rhs.i + +land.rhs.i: ; preds = %for.body2.i + %arrayidx8.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %indvars.iv.i, i64 0 + %3 = load i8, i8* %arrayidx8.i, align 1 + %conv9.i = zext i8 %3 to i16 + store i16 %conv9.i, i16* @e, align 2 + br label %land.end.i + +land.end.i: ; preds = %land.rhs.i, %for.body2.i + %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1 + %tobool.i = icmp eq i64 %indvars.iv.next.i, 0 + br i1 %tobool.i, label %for.inc12.i, label %for.body2.i + +for.inc12.i: ; preds = %land.end.i + %indvars.iv.next5.i = add nsw i64 %indvars.iv4.i, 1 + %cmp.i = icmp slt i64 %indvars.iv4.i, 7 + br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.for.end13_crit_edge.i + +for.cond.for.end13_crit_edge.i: ; preds = %for.inc12.i + %indvars.iv.next5.i.lcssa = phi i64 [ %indvars.iv.next5.i, %for.inc12.i ] + %4 = trunc i64 %indvars.iv.next5.i.lcssa to i32 + store i32 0, i32* @d, align 4 + store i32 %4, i32* @c, align 4 + br label %f.exit + +f.exit: ; preds = %for.cond.for.end13_crit_edge.i, %entry + %5 = load i16, i16* @e, align 2 + %conv = sext i16 %5 to i32 + ret i32 0 +} + +; Switched the order of inner and outer loops. +; +; char b[][8] = {{}, {}, {}, {}, {5}, {}, 2, 3}; +; int c, d; +; short e; +; static char f() { +; for (d = 4; d; d--) +; for (c = 0; c <= 7; c++) { +; b[d + 2][c] && (e = b[d][0]); +; } +; } + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: BranchInst +; CHECK-NEXT: Function: test4 + +; IR-LABEL: @test4( +; IR-NOT: split + +; Function Attrs: nofree nounwind optsize +define dso_local i32 @test4() local_unnamed_addr #0 { +entry: + br label %for.cond1.preheader.i + +for.cond1.preheader.i: ; preds = %for.inc12.i, %entry + %indvars.iv4.i = phi i64 [ 4, %entry ], [ %indvars.iv.next5.i, %for.inc12.i ] + %0 = add nuw nsw i64 %indvars.iv4.i, 2 + %arrayidx8.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %indvars.iv4.i, i64 0 + br label %for.body2.i + +for.body2.i: ; preds = %land.end.i, %for.cond1.preheader.i + %indvars.iv.i = phi i64 [ 0, %for.cond1.preheader.i ], [ %indvars.iv.next.i, %land.end.i ] + %arrayidx4.i = getelementptr inbounds [7 x [8 x i8]], [7 x [8 x i8]]* @b, i64 0, i64 %0, i64 %indvars.iv.i + %1 = load i8, i8* %arrayidx4.i + %tobool5.not.i = icmp eq i8 %1, 0 + br i1 %tobool5.not.i, label %land.end.i, label %land.rhs.i + +land.rhs.i: ; preds = %for.body2.i + %2 = load i8, i8* %arrayidx8.i + %conv9.i = zext i8 %2 to i16 + store i16 %conv9.i, i16* @e + br label %land.end.i + +land.end.i: ; preds = %land.rhs.i, %for.body2.i + %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 + %exitcond.not.i = icmp eq i64 %indvars.iv.next.i, 8 + br i1 %exitcond.not.i, label %for.inc12.i, label %for.body2.i + +for.inc12.i: ; preds = %land.end.i + %indvars.iv.next5.i = add nsw i64 %indvars.iv4.i, -1 + %tobool.not.i = icmp eq i64 %indvars.iv.next5.i, 0 + br i1 %tobool.not.i, label %f.exit, label %for.cond1.preheader.i + +f.exit: ; preds = %for.inc12.i + store i32 8, i32* @c + store i32 0, i32* @d + %3 = load i16, i16* @e + %conv = sext i16 %3 to i32 + ret i32 0 +} \ No newline at end of file