Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -311,11 +311,18 @@ bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, - const DenseMap &CostMap); + const DenseMap &CostMap, + std::unique_ptr &CC); private: int getInstrOrderCost(); - + std::optional isProfitablePerLoopCacheAnalysis( + const DenseMap &CostMap, + std::unique_ptr &CC); + std::optional isProfitablePerInstrOrderCost(); + std::optional isProfitableForVectorization(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix); Loop *OuterLoop; Loop *InnerLoop; @@ -512,7 +519,7 @@ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, CostMap)) { + DependencyMatrix, CostMap, CC)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1091,31 +1098,10 @@ return GoodOrder - BadOrder; } -static bool isProfitableForVectorization(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Improve this heuristic to catch more cases. - // If the inner loop is loop independent or doesn't carry any dependency it is - // profitable to move this to outer position. - for (auto &Row : DepMatrix) { - if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') - return false; - // TODO: We need to improve this heuristic. - if (Row[OuterLoopId] != '=') - return false; - } - // If outer loop has dependence and inner loop is loop independent then it is - // profitable to interchange to enable parallelism. - // If there are no dependences, interchanging will not improve anything. - return !DepMatrix.empty(); -} - -bool LoopInterchangeProfitability::isProfitable( - const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, - unsigned OuterLoopId, CharMatrix &DepMatrix, - const DenseMap &CostMap) { - // TODO: Remove the legacy cost model. - +std::optional +LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis( + const DenseMap &CostMap, + std::unique_ptr &CC) { // This is the new cost model returned from loop cache analysis. // A smaller index means the loop should be placed an outer loop, and vice // versa. @@ -1127,30 +1113,90 @@ LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex << ", OuterIndex = " << OuterIndex << "\n"); if (InnerIndex < OuterIndex) - return true; - } else { - // Legacy cost model: this is rough cost estimation algorithm. It counts the - // good and bad order of induction variables in the instruction and allows - // reordering if number of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; + return std::optional(true); + + if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop)) + return std::nullopt; + return std::optional(false); } + return std::nullopt; +} - // It is not profitable as per current cache profitability model. But check if - // we can move this loop outside to improve parallelism. - if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) - return true; +std::optional +LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { + // Legacy cost model: this is rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < 0 && Cost < LoopInterchangeCostThreshold) + return std::optional(true); + + return std::nullopt; +} - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Interchanging loops is too costly and it does not improve " - "parallelism."; - }); - return false; +std::optional LoopInterchangeProfitability::isProfitableForVectorization( + unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { + for (auto &Row : DepMatrix) { + // If the inner loop is loop independent or doesn't carry any dependency + // it is not profitable to move this to outer position, since we are + // likely able to do inner loop vectorization already. + if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=') + return std::optional(false); + + // If the outer loop is not loop independent it is not profitable to move + // this to inner position, since doing so would not enable inner loop + // parallelism. + if (Row[OuterLoopId] != '=') + return std::optional(false); + } + // If inner loop has dependence and outer loop is loop independent then it + // is/ profitable to interchange to enable inner loop parallelism. + // If there are no dependences, interchanging will not improve anything. + return std::optional(!DepMatrix.empty()); +} + +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap &CostMap, + std::unique_ptr &CC) { + // isProfitable() is structured to avoid endless loop interchange. + // If loop cache analysis could decide the profitability then, + // profitability check will stop and return the analysis result. + // If cache analysis failed to analyze the loopnest (e.g., + // due to delinearization issues) then only check whether it is + // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to + // analysis the profitability then only, isProfitableForVectorization + // will decide. + std::optional shouldInterchange = + isProfitablePerLoopCacheAnalysis(CostMap, CC); + if (!shouldInterchange.has_value()) { + shouldInterchange = isProfitablePerInstrOrderCost(); + if (!shouldInterchange.has_value()) + shouldInterchange = + isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); + } + if (!shouldInterchange.has_value()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Insufficient information to calculate the cost of loop for " + "interchange."; + }); + return false; + } else if (!shouldInterchange.value()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Interchanging loops is not considered to improve cache locality " + "nor vectorization."; + }); + return false; + } + return true; } void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll +++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -71,7 +71,7 @@ ; DELIN-NEXT: Name: InterchangeNotProfitable ; DELIN-NEXT: Function: test01 ; DELIN-NEXT: Args: -; DELIN-NEXT: - String: Interchanging loops is too costly and it does not improve parallelism. +; DELIN-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization. ; DELIN-NEXT: ... ;;--------------------------------------Test case 02------------------------------------ Index: llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll +++ llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll @@ -156,20 +156,29 @@ ; Make sure we do not crash for loops without reachable exits. define void @no_reachable_exits() { -; Check we interchanged. -; CHECK-LABEL: @no_reachable_exits() { +; Check we do not crash. +; CHECK-LABEL: @no_reachable_exits( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %inner.ph -; CHECK-LABEL: outer.ph: -; CHECK-NEXT: br label %outer.header -; CHECK-LABEL: inner.ph: -; CHECK-NEXT: br label %inner.body -; CHECK-LABEL: inner.body: -; CHECK-NEXT: %tmp31 = phi i32 [ 0, %inner.ph ], [ %[[IVNEXT:[0-9]]], %inner.body.split ] -; CHECK-NEXT: br label %outer.ph -; CHECK-LABEL: inner.body.split: -; CHECK-NEXT: %[[IVNEXT]] = add nsw i32 %tmp31, 1 -; CHECK-NEXT: br i1 false, label %inner.body, label %exit +; CHECK-NEXT: br label [[OUTER_PH:%.*]] +; CHECK: outer.ph: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[OUTER_PH]] ], [ [[TMP8:%.*]], [[OUTER_LATCH:%.*]] ] +; CHECK-NEXT: br i1 undef, label [[INNER_PH:%.*]], label [[OUTER_LATCH]] +; CHECK: inner.ph: +; CHECK-NEXT: br label [[INNER_BODY:%.*]] +; CHECK: inner.body: +; CHECK-NEXT: [[TMP31:%.*]] = phi i32 [ 0, [[INNER_PH]] ], [ [[TMP6:%.*]], [[INNER_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** undef, align 8 +; CHECK-NEXT: [[TMP6]] = add nsw i32 [[TMP31]], 1 +; CHECK-NEXT: br i1 false, label [[INNER_BODY]], label [[OUTER_LATCH_LOOPEXIT:%.*]] +; CHECK: outer.latch.loopexit: +; CHECK-NEXT: br label [[OUTER_LATCH]] +; CHECK: outer.latch: +; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP2]], 1 +; CHECK-NEXT: br i1 false, label [[OUTER_HEADER]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: unreachable bb: Index: llvm/test/Transforms/LoopInterchange/pr57148.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr57148.ll +++ llvm/test/Transforms/LoopInterchange/pr57148.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -cache-line-size=4 -loop-interchange-threshold=-100 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s ; Make sure the loops are in LCSSA form after loop interchange, ; and loop interchange does not hit assertion errors and crash. Index: llvm/test/Transforms/LoopInterchange/profitability.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/profitability.ll +++ llvm/test/Transforms/LoopInterchange/profitability.ll @@ -2,6 +2,10 @@ ; RUN: -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange ; RUN: FileCheck -input-file %t %s +; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \ +; RUN: -pass-remarks-output=%t -pass-remarks='loop-interchange' -S +; RUN: cat %t | FileCheck --check-prefix=PROFIT %s + ;; We test profitability model in these test cases. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -10,7 +14,7 @@ @B = common global [100 x [100 x i32]] zeroinitializer ;;---------------------------------------Test case 01--------------------------------- -;; Loops interchange will result in code vectorization and hence profitable. Check for interchange. +;; Loops interchange will result in better cache locality and hence profitable. Check for interchange. ;; for(int i=1;i<100;i++) ;; for(int j=1;j<100;j++) ;; A[j][i] = A[j - 1][i] + B[j][i]; @@ -162,3 +166,60 @@ for.end12: ret void } + +;;---------------------------------------Test case 05--------------------------------- +;; This test is to make sure, that multiple invocations of loop interchange will not +;; undo previous interchange and will converge to a particular order determined by the +;; profitability analysis. +;; for(int i=1;i<100;i++) +;; for(int j=1;j<100;j++) +;; A[j][0] = A[j][0] + B[j][i]; + +; CHECK: Name: Interchanged +; CHECK-NEXT: Function: interchange_05 + +; PROFIT-LABEL: --- !Passed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: Interchanged +; PROFIT-LABEL: Function: interchange_05 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Loop interchanged with enclosing loop. +; PROFIT-NEXT: ... +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_05 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... +define void @interchange_05() { +entry: + br label %for2.preheader + +for2.preheader: + %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ] + br label %for2 + +for2: + %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ] + %j.prev = add nsw i64 %j, -1 + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0 + %lv1 = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j, i64 %i30 + %lv2 = load i32, i32* %arrayidx9 + %add = add nsw i32 %lv1, %lv2 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0 + store i32 %add, i32* %arrayidx13 + %i.next = add nuw nsw i64 %j, 1 + %exitcond = icmp eq i64 %j, 99 + br i1 %exitcond, label %for1.inc14, label %for2 + +for1.inc14: + %i.next31 = add nuw nsw i64 %i30, 1 + %exitcond33 = icmp eq i64 %i30, 99 + br i1 %exitcond33, label %for.end16, label %for2.preheader + +for.end16: + ret void +} +