diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -266,26 +266,28 @@ return true; } -static LoopVector populateWorklist(Loop &L) { +static void populateWorklist(Loop &L, LoopVector &LoopList) { LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " << L.getHeader()->getParent()->getName() << " Loop: %" << L.getHeader()->getName() << '\n'); - LoopVector LoopList; + assert(LoopList.empty() && "LoopList should initially be empty!"); Loop *CurrentLoop = &L; const std::vector *Vec = &CurrentLoop->getSubLoops(); while (!Vec->empty()) { // The current loop has multiple subloops in it hence it is not tightly // nested. // Discard all loops above it added into Worklist. - if (Vec->size() != 1) - return {}; + if (Vec->size() != 1) { + LoopList = {}; + return; + } LoopList.push_back(CurrentLoop); CurrentLoop = Vec->front(); Vec = &CurrentLoop->getSubLoops(); } LoopList.push_back(CurrentLoop); - return LoopList; + return; } namespace { @@ -419,12 +421,13 @@ bool run(Loop *L) { if (L->getParentLoop()) return false; - - return processLoopList(populateWorklist(*L)); + SmallVector LoopList; + populateWorklist(*L, LoopList); + return processLoopList(LoopList); } bool run(LoopNest &LN) { - const auto &LoopList = LN.getLoops(); + SmallVector LoopList(LN.getLoops().begin(), LN.getLoops().end()); for (unsigned I = 1; I < LoopList.size(); ++I) if (LoopList[I]->getParentLoop() != LoopList[I - 1]) return false; @@ -456,7 +459,7 @@ return LoopList.size() - 1; } - bool processLoopList(ArrayRef LoopList) { + bool processLoopList(SmallVectorImpl &LoopList) { bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { @@ -496,20 +499,32 @@ } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best possible position. - Loop *LoopToBeInterchanged = LoopList[SelecLoopId]; - for (unsigned i = SelecLoopId; i > 0; i--) { - bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i, - i - 1, DependencyMatrix); - if (!Interchanged) - return Changed; - // Update the DependencyMatrix - interChangeDependencies(DependencyMatrix, i, i - 1); + // We try to achieve the globally optimal memory access for the loopnest, + // and do interchange based on a bubble-sort fasion. We start from + // the innermost loop, move it outwards to the best possible position + // and repeat this process. + for (unsigned j = SelecLoopId; j > 0; j--) { + bool ChangedPerIter = false; + for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { + bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, + DependencyMatrix); + if (!Interchanged) + continue; + // Loops interchanged, update LoopList accordingly. + std::swap(LoopList[i - 1], LoopList[i]); + // Update the DependencyMatrix + interChangeDependencies(DependencyMatrix, i, i - 1); #ifdef DUMP_DEP_MATRICIES - LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); - printDepMatrix(DependencyMatrix); + LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); + printDepMatrix(DependencyMatrix); #endif - Changed |= Interchanged; + ChangedPerIter |= Interchanged; + Changed |= Interchanged; + } + // Early abort if there was no interchange during an entire round of + // moving loops outwards. + if (!ChangedPerIter) + break; } return Changed; } @@ -1419,9 +1434,13 @@ // Incoming values are guaranteed be instructions currently. auto IncI = cast(P.getIncomingValueForBlock(InnerLatch)); + // In case of multi-level nested loops, follow LCSSA to find the incoming + // value defined from the innermost loop. + auto IncIInnerMost = cast(followLCSSA(IncI)); // Skip phis with incoming values from the inner loop body, excluding the // header and latch. - if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader) + if (IncIInnerMost->getParent() != InnerLatch && + IncIInnerMost->getParent() != InnerHeader) continue; assert(all_of(P.users(), diff --git a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll --- a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll +++ b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=-1000 -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=0 -S 2>&1 | FileCheck %s ;; Checks the order of the inner phi nodes does not cause havoc. ;; The inner loop has a reduction into c. The IV is not the first phi. @@ -9,7 +9,7 @@ ; Function Attrs: norecurse nounwind -define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 { +define void @test(i32 %T, [90 x i32]* noalias nocapture %C, [90 x [90 x i16]]* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR3_PREHEADER:%.*]] @@ -31,7 +31,7 @@ ; CHECK-NEXT: br label [[FOR1_HEADER_PREHEADER]] ; CHECK: for3.split1: ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[K]], [[MUL]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[ADD]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* [[A:%.*]], i32 [[ADD]], i32 [[J]], i32 [[I]] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 ; CHECK-NEXT: [[ADD15:%.*]] = add nsw i16 [[TMP0]], 1 ; CHECK-NEXT: store i16 [[ADD15]], i16* [[ARRAYIDX]] @@ -70,7 +70,7 @@ for3: ; preds = %for3, %for2.header %k = phi i32 [ 1, %for2.header ], [ %inc, %for3 ] %add = add nsw i32 %k, %mul - %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %arrayidx = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* %A, i32 %add, i32 %j, i32 %i %0 = load i16, i16* %arrayidx, align 2 %add15 = add nsw i16 %0, 1 store i16 %add15, i16* %arrayidx diff --git a/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll @@ -0,0 +1,64 @@ +; RUN: opt < %s -basic-aa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 +; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s + +; Triply nested loop, should be able to do interchange three times +; to get the ideal access pattern. +; void f(int e[10][10][10], int f[10][10][10]) { +; for (int a = 0; a < 10; a++) { +; for (int b = 0; b < 10; b++) { +; for (int c = 0; c < 10; c++) { +; f[c][b][a] = e[c][b][a]; +; } +; } +; } +; } + +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: pr43326-triply-nested +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: pr43326-triply-nested +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: pr43326-triply-nested + +define void @pr43326-triply-nested([10 x [10 x i32]]* %e, [10 x [10 x i32]]* %f) { +entry: + br label %for.outermost.header + +for.outermost.header: ; preds = %entry, %for.outermost.latch + %indvars.outermost = phi i64 [ 0, %entry ], [ %indvars.outermost.next, %for.outermost.latch ] + br label %for.middle.header + +for.cond.cleanup: ; preds = %for.outermost.latch + ret void + +for.middle.header: ; preds = %for.outermost.header, %for.middle.latch + %indvars.middle = phi i64 [ 0, %for.outermost.header ], [ %indvars.middle.next, %for.middle.latch ] + br label %for.innermost + +for.outermost.latch: ; preds = %for.middle.latch + %indvars.outermost.next = add nuw nsw i64 %indvars.outermost, 1 + %exitcond.outermost = icmp ne i64 %indvars.outermost.next, 10 + br i1 %exitcond.outermost, label %for.outermost.header, label %for.cond.cleanup + +for.middle.latch: ; preds = %for.innermost + %indvars.middle.next = add nuw nsw i64 %indvars.middle, 1 + %exitcond.middle = icmp ne i64 %indvars.middle.next, 10 + br i1 %exitcond.middle, label %for.middle.header, label %for.outermost.latch + +for.innermost: ; preds = %for.middle.header, %for.innermost + %indvars.innermost = phi i64 [ 0, %for.middle.header ], [ %indvars.innermost.next, %for.innermost ] + %arrayidx12 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %e, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost + %0 = load i32, i32* %arrayidx12 + %arrayidx18 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %f, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost + store i32 %0, i32* %arrayidx18 + %indvars.innermost.next = add nuw nsw i64 %indvars.innermost, 1 + %exitcond.innermost = icmp ne i64 %indvars.innermost.next, 10 + br i1 %exitcond.innermost, label %for.innermost, label %for.middle.latch +} \ No newline at end of file