Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -338,10 +338,18 @@ bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, - const DenseMap &CostMap); + const DenseMap &CostMap, + std::unique_ptr& CC); private: int getInstrOrderCost(); + std::optional isProfitableAccordingLoopCacheAnalysis( + const DenseMap &CostMap); + std::optional isProfitableAccordingInstrOrderCost(); + std::optional isProfitableForVectorization(unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix, + std::unique_ptr& CC); Loop *OuterLoop; Loop *InnerLoop; @@ -539,7 +547,7 @@ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, - DependencyMatrix, CostMap)) { + DependencyMatrix, CostMap, CC)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1110,31 +1118,8 @@ return GoodOrder - BadOrder; } -static bool isProfitableForVectorization(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Improve this heuristic to catch more cases. - // If the inner loop is loop independent or doesn't carry any dependency it is - // profitable to move this to outer position. - for (auto &Row : DepMatrix) { - if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') - return false; - // TODO: We need to improve this heuristic. - if (Row[OuterLoopId] != '=') - return false; - } - // If outer loop has dependence and inner loop is loop independent then it is - // profitable to interchange to enable parallelism. - // If there are no dependences, interchanging will not improve anything. - return !DepMatrix.empty(); -} - -bool LoopInterchangeProfitability::isProfitable( - const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, - unsigned OuterLoopId, CharMatrix &DepMatrix, - const DenseMap &CostMap) { - // TODO: Remove the legacy cost model. - +std::optional LoopInterchangeProfitability::isProfitableAccordingLoopCacheAnalysis( + const DenseMap &CostMap) { // This is the new cost model returned from loop cache analysis. // A smaller index means the loop should be placed an outer loop, and vice // versa. @@ -1146,30 +1131,87 @@ LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex << ", OuterIndex = " << OuterIndex << "\n"); if (InnerIndex < OuterIndex) - return true; - } else { - // Legacy cost model: this is rough cost estimation algorithm. It counts the - // good and bad order of induction variables in the instruction and allows - // reordering if number of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; + return std::optional(true); + else + return std::optional(false); } + return std::nullopt; +} - // It is not profitable as per current cache profitability model. But check if - // we can move this loop outside to improve parallelism. - if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) - return true; +std::optional LoopInterchangeProfitability::isProfitableAccordingInstrOrderCost() { + // Legacy cost model: this is rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + const int LoopInterchangeCostMeaningfulnessThreshold = 10; + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (abs(Cost) < LoopInterchangeCostMeaningfulnessThreshold) + if (Cost < -LoopInterchangeCostThreshold) + return std::optional(true); + return std::nullopt; +} - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Interchanging loops is too costly and it does not improve " - "parallelism."; - }); - return false; +std::optional LoopInterchangeProfitability::isProfitableForVectorization( + unsigned InnerLoopId, + unsigned OuterLoopId, + CharMatrix &DepMatrix, + std::unique_ptr& CC) { + // To prevent endless interchange, only check whether it is profitable for vectorization + // when InnerLoop and OuterLoop has the same locality, or loop cache analysis failed to + // analyze the loopnest (e.g., due to delinearization issues). + if (CC && CC->getLoopCost(*InnerLoop) == CC->getLoopCost(*OuterLoop)) { + for (auto &Row : DepMatrix) { + // If the inner loop is loop independent or doesn't carry any dependency it is + // not profitable to move this to outer position, since we are likely able to do + // inner loop vectorization already. + if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=') + return std::optional(false); + + // If the outer loop is not loop independent it is not profitable to move this + // to inner position, since doing so would not enable inner loop parallelism. + if (Row[OuterLoopId] != '=') + return std::optional(false); + } + // If inner loop has dependence and outer loop is loop independent then it is + // profitable to interchange to enable inner loop parallelism. + // If there are no dependences, interchanging will not improve anything. + return std::optional(!DepMatrix.empty()); + } + return std::nullopt; +} + +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap &CostMap, + std::unique_ptr& CC) { + std::optional shouldInterchange = isProfitableAccordingLoopCacheAnalysis(CostMap); + if (!shouldInterchange.has_value()) { + shouldInterchange = isProfitableAccordingInstrOrderCost(); + if (!shouldInterchange.has_value()) { + shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix, CC); + } + } + if (!shouldInterchange.has_value()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Insufficient information to calculate the cost of loop for interchange."; + }); + return false; + } + else if (!shouldInterchange.value()) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Interchanging loops not considered to improve cache locality " + "nor vectorization."; + }); + return false; + } + return true; } void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll +++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -71,7 +71,7 @@ ; DELIN-NEXT: Name: InterchangeNotProfitable ; DELIN-NEXT: Function: test01 ; DELIN-NEXT: Args: -; DELIN-NEXT: - String: Interchanging loops is too costly and it does not improve parallelism. +; DELIN-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. ; DELIN-NEXT: ... ;;--------------------------------------Test case 02------------------------------------ Index: llvm/test/Transforms/LoopInterchange/profitability.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/profitability.ll +++ llvm/test/Transforms/LoopInterchange/profitability.ll @@ -2,6 +2,10 @@ ; RUN: -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange ; RUN: FileCheck -input-file %t %s +; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \ +; RUN: -pass-remarks-output=%t -pass-remarks='loop-interchange' -S +; RUN: cat %t | FileCheck --check-prefix=PROFIT %s + ;; We test profitability model in these test cases. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -10,7 +14,7 @@ @B = common global [100 x [100 x i32]] zeroinitializer ;;---------------------------------------Test case 01--------------------------------- -;; Loops interchange will result in code vectorization and hence profitable. Check for interchange. +;; Loops interchange will result in better cache locality and hence profitable. Check for interchange. ;; for(int i=1;i<100;i++) ;; for(int j=1;j<100;j++) ;; A[j][i] = A[j - 1][i] + B[j][i]; @@ -18,6 +22,20 @@ ; CHECK: Name: Interchanged ; CHECK-NEXT: Function: interchange_01 +; PROFIT: --- !Passed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: Interchanged +; PROFIT-NEXT: Function: interchange_01 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Loop interchanged with enclosing loop. +; PROFIT-NEXT: ... +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_01 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... define void @interchange_01() { entry: br label %for2.preheader @@ -59,6 +77,21 @@ ; CHECK: Name: Interchanged ; CHECK-NEXT: Function: interchange_02 + +; PROFIT: --- !Passed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: Interchanged +; PROFIT-NEXT: Function: interchange_02 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Loop interchanged with enclosing loop. +; PROFIT-NEXT: ... +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_02 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... define void @interchange_02() { entry: br label %for1.header @@ -98,6 +131,21 @@ ; CHECK: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: interchange_03 + +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_03 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_03 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... define void @interchange_03(){ entry: br label %for1.header @@ -136,6 +184,21 @@ ; CHECK: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: interchange_04 + +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_04 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_04 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... define void @interchange_04(i32 %k) { entry: br label %for.cond1.preheader @@ -162,3 +225,58 @@ for.end12: ret void } + +;;---------------------------------------Test case 05--------------------------------- +;; Loops interchange will result in code cache cost and hence profitable. Check for interchange. +;; for(int i=1;i<100;i++) +;; for(int j=1;j<100;j++) +;; A[j][0] = A[j][0] + B[j][i]; + +; CHECK: Name: Interchanged +; CHECK-NEXT: Function: interchange_05 + +; PROFIT: --- !Passed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: Interchanged +; PROFIT-NEXT: Function: interchange_05 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Loop interchanged with enclosing loop. +; PROFIT-NEXT: ... +; PROFIT: --- !Missed +; PROFIT-NEXT: Pass: loop-interchange +; PROFIT-NEXT: Name: InterchangeNotProfitable +; PROFIT-NEXT: Function: interchange_05 +; PROFIT-NEXT: Args: +; PROFIT-NEXT: - String: Interchanging loops not considered to improve cache locality nor vectorization. +; PROFIT-NEXT: ... +define void @interchange_05() { +entry: + br label %for2.preheader + +for2.preheader: + %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ] + br label %for2 + +for2: + %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ] + %j.prev = add nsw i64 %j, -1 + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0 + %lv1 = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j, i64 %i30 + %lv2 = load i32, i32* %arrayidx9 + %add = add nsw i32 %lv1, %lv2 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0 + store i32 %add, i32* %arrayidx13 + %i.next = add nuw nsw i64 %j, 1 + %exitcond = icmp eq i64 %j, 99 + br i1 %exitcond, label %for1.inc14, label %for2 + +for1.inc14: + %i.next31 = add nuw nsw i64 %i30, 1 + %exitcond33 = icmp eq i64 %i30, 99 + br i1 %exitcond33, label %for.end16, label %for2.preheader + +for.end16: + ret void +} +