diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -556,6 +556,9 @@ Loop *InnerLoop = LoopList[InnerLoopId]; Loop *OuterLoop = LoopList[OuterLoopId]; + assert(InnerLoop->isLCSSAForm(*DT) && "Inner loop is not in LCSSA form."); + assert(OuterLoop->isLCSSAForm(*DT) && "Outer loop is not in LCSSA form."); + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); @@ -580,6 +583,12 @@ LIT.transform(); LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); LoopsInterchanged++; + + assert(InnerLoop->isLCSSAForm(*DT) && + "Inner loop not left in LCSSA form after loop interchange!"); + assert(OuterLoop->isLCSSAForm(*DT) && + "Outer loop not left in LCSSA form after loop interchange!"); + return true; } }; @@ -1319,6 +1328,29 @@ FromBB->getTerminator()->getIterator()); } +/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. +static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) { + // Put instructions from BB1 into TempInstrs and unlink them from BB1. + SmallVector TempInstrs; + for (Instruction &I : *BB1) { + if (&I == BB1->getTerminator()) { + break; + } + TempInstrs.push_back(&I); + } + for (Instruction *I : TempInstrs) { + I->removeFromParent(); + } + + // Move instructions from BB2 to BB1. + moveBBContents(BB2, BB1->getTerminator()); + + // Move instructions from TempInstrs to BB2. + for (Instruction *I : TempInstrs) { + I->insertBefore(BB2->getTerminator()); + } +} + // Update BI to jump to NewBB instead of OldBB. Records updates to the // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that // \p OldBB is exactly once in BI's successor list. @@ -1585,16 +1617,7 @@ // inside the outer loop. BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); - BranchInst *InnerTermBI = - cast(InnerLoopPreHeader->getTerminator()); - - // These instructions should now be executed inside the loop. - // Move instruction into a new block after outer header. - moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator()); - // These instructions were not executed previously in the loop so move them to - // the older inner loop preheader. - moveBBContents(OuterLoopPreHeader, InnerTermBI); + swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader); } bool LoopInterchangeTransform::adjustLoopLinks() { diff --git a/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll b/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -pass-remarks-output=%t -S +; RUN: FileCheck --input-file %t --check-prefix REMARK %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; void foo(int **res, int n, int m) { +; int temp[16][16]; +; for(int i = 0; i < n; i++) { +; for(int j = 0; j < m; j++) +; res[j][i] = temp[j][i]; +; } +; } + +; REMARK: Interchanged +; REMARK-NEXT: lcssa_08 +; Function Attrs: nounwind +define dso_local void @lcssa_08(i32** nocapture readonly %res, i32 %n, i32 %m) local_unnamed_addr #0 { +entry: + %temp = alloca [16 x [16 x i32]], align 4 + %0 = bitcast [16 x [16 x i32]]* %temp to i8* + %cmp24 = icmp sgt i32 %n, 0 + br i1 %cmp24, label %outer.preheader, label %for.cond.cleanup + +outer.preheader: ; preds = %entry + %wide.trip.count29 = zext i32 %n to i64 + br label %outer.header + +outer.header: ; preds = %outer.preheader, %outer.latch + %indvars.iv27 = phi i64 [ 0, %outer.preheader ], [ %indvars.iv.next28, %outer.latch ] + %cmp222 = icmp sgt i32 %m, 0 + br i1 %cmp222, label %inner.preheader, label %outer.latch + +inner.preheader: ; preds = %outer.header + ; When inner.preheader becomes the outer preheader, do not move + ; %wide.trip.count into the inner loop header lest LCSSA break + ; (if %wide.trip.count gets moved, its use is now outside the inner loop). + %wide.trip.count = zext i32 %m to i64 + br label %inner.for.body + +inner.for.body: ; preds = %inner.preheader, %inner.for.body + %indvars.iv = phi i64 [ 0, %inner.preheader ], [ %indvars.iv.next, %inner.for.body ] + %arrayidx6 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* %temp, i64 0, i64 %indvars.iv, i64 %indvars.iv27 + %1 = load i32, i32* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds i32*, i32** %res, i64 %indvars.iv + %2 = load i32*, i32** %arrayidx8, align 8 + %arrayidx10 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv27 + store i32 %1, i32* %arrayidx10, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %inner.for.body, label %inner.crit_edge + +inner.crit_edge: ; preds = %inner.for.body + br label %outer.latch + +outer.latch: ; preds = %inner.crit_edge, %outer.header + %indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1 + %exitcond30 = icmp ne i64 %indvars.iv.next28, %wide.trip.count29 + br i1 %exitcond30, label %outer.header, label %outer.crit_edge + +outer.crit_edge: ; preds = %outer.latch + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %outer.crit_edge, %entry + ret void +}