Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" @@ -358,8 +359,10 @@ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. - bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix); + bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, + unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix, + const DenseMap &CostMap); private: int getInstrOrderCost(); @@ -410,13 +413,15 @@ LoopInfo *LI = nullptr; DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; + std::unique_ptr CC = nullptr; /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, - DominatorTree *DT, OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {} + DominatorTree *DT, std::unique_ptr &CC, + OptimizationRemarkEmitter *ORE) + : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) @@ -499,6 +504,21 @@ } unsigned SelecLoopId = selectLoopForInterchange(LoopList); + // Obtain the loop vector returned from loop cache analysis beforehand, + // and put each <Loop, index> pair into a map for constant time query + // later. Indices in loop vector represent the optimal order of the + // corresponding loop, e.g., given a loopnest with depth N, index 0 + // indicates the loop should be placed as the outermost loop and index N-1 + // indicates the loop should be placed as the innermost loop. 
+ // + // For the old pass manager CacheCost would be null. + DenseMap CostMap; + if (CC != nullptr) { + const auto &LoopCosts = CC->getLoopCosts(); + for (unsigned i = 0; i < LoopCosts.size(); i++) { + CostMap[LoopCosts[i].first] = i; + } + } // We try to achieve the globally optimal memory access for the loopnest, // and do interchange based on a bubble-sort fasion. We start from // the innermost loop, move it outwards to the best possible position @@ -507,7 +527,7 @@ bool ChangedPerIter = false; for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, - DependencyMatrix); + DependencyMatrix, CostMap); if (!Interchanged) continue; // Loops interchanged, update LoopList accordingly. @@ -531,7 +551,8 @@ bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - std::vector> &DependencyMatrix) { + std::vector> &DependencyMatrix, + const DenseMap &CostMap) { LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); @@ -541,7 +562,8 @@ } LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); - if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, + DependencyMatrix, CostMap)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1135,21 +1157,33 @@ return !DepMatrix.empty(); } -bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. - // e.g - // 1) Construct dependency matrix and move the one with no loop carried dep - // inside to enable vectorization. - - // This is rough cost estimation algorithm. 
It counts the good and bad order - // of induction variables in the instruction and allows reordering if number - // of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap &CostMap) { + // TODO: Remove the legacy cost model. + + // This is the new cost model returned from loop cache analysis. + // A smaller index means the loop should be placed as an outer loop, and vice + // versa. + if (CostMap.find(InnerLoop) != CostMap.end() && + CostMap.find(OuterLoop) != CostMap.end()) { + unsigned InnerIndex = 0, OuterIndex = 0; + InnerIndex = CostMap.find(InnerLoop)->second; + OuterIndex = CostMap.find(OuterLoop)->second; + LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex + << ", OuterIndex = " << OuterIndex << "\n"); + if (InnerIndex < OuterIndex) + return true; + } else { + // Legacy cost model: this is a rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < -LoopInterchangeCostThreshold) + return true; + } // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. 
@@ -1160,10 +1194,8 @@ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", InnerLoop->getStartLoc(), InnerLoop->getHeader()) - << "Interchanging loops is too costly (cost=" - << ore::NV("Cost", Cost) << ", threshold=" - << ore::NV("Threshold", LoopInterchangeCostThreshold) - << ") and it does not improve parallelism."; + << "Interchanging loops is too costly and it does not improve " + "parallelism."; }); return false; } @@ -1709,8 +1741,8 @@ auto *DI = &getAnalysis().getDI(); auto *DT = &getAnalysis().getDomTree(); auto *ORE = &getAnalysis().getORE(); - - return LoopInterchange(SE, LI, DI, DT, ORE).run(L); + std::unique_ptr CC = nullptr; + return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L); } }; } // namespace @@ -1737,8 +1769,10 @@ Function &F = *LN.getParent(); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + std::unique_ptr CC = + CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); OptimizationRemarkEmitter ORE(&F); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } Index: llvm/test/Transforms/LICM/lnicm.ll =================================================================== --- llvm/test/Transforms/LICM/lnicm.ll +++ llvm/test/Transforms/LICM/lnicm.ll @@ -1,12 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -aa-pipeline=basic-aa -passes='loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes INTC -; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM,CHECK -; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LICM,CHECK +; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM +; RUN: opt -aa-pipeline=basic-aa 
-passes='loop-mssa(licm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LICM ; This test represents the following function: -; void test(int x[10][10], int y[10], int *z) { -; for (int k = 0; k < 10; k++) { +; void test(int n, int m, int x[m][n], int y[n], int *z) { +; for (int k = 0; k < n; k++) { ; int tmp = *z; -; for (int i = 0; i < 10; i++) +; for (int i = 0; i < m; i++) ; x[i][k] += y[k] + tmp; ; } ; } @@ -15,83 +16,189 @@ ; to keep perfect loop nest. This enables optimizations that require ; perfect loop nest (e.g. loop-interchange) to perform. +target triple = "powerpc64le-unknown-linux-gnu" -define dso_local void @test([10 x i32]* noalias %x, i32* noalias readonly %y, i32* readonly %z) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[Z:%.*]] = load i32, i32* %z, align 4 -; CHECK-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]] -; LNICM: for.body.preheader: -; LICM-NOT: for.body.preheader: -; INTC-NOT: for.body.preheader: -; LNICM-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; LNICM-NEXT: [[K:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] -; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]] -; LICM: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4 -; LNICM: for.body3.preheader: -; LICM-NOT: for.body3.preheader: -; INTC-NOT: for.body3.preheader: -; LNICM-NEXT: br label [[FOR_BODY3:%.*]] -; CHECK: for.body3: -; LNICM-NEXT: [[I:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER:%.*]] ] -; LNICM-NEXT: br label [[FOR_BODY_PREHEADER:%.*]] -; LNICM: for.body3.split1: -; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K:%.*]] to i64 -; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %y, i64 [[IDXPROM:%.*]] -; LNICM-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4 -; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP:%.*]], [[Z:%.*]] -; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I:%.*]] to i64 -; LNICM-NEXT: [[ARRAYIDX5:%.*]] = getelementptr 
inbounds [10 x i32], [10 x i32]* %x, i64 [[IDXPROM4:%.*]] -; LNICM-NEXT: [[IDXPROM6:%.*]] = sext i32 [[K:%.*]] to i64 -; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5:%.*]], i64 0, i64 [[IDXPROM6:%.*]] -; LNICM-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7:%.*]], align 4 -; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2:%.*]], [[ADD:%.*]] -; LNICM-NEXT: store i32 [[ADD8:%.*]], i32* [[ARRAYIDX7:%.*]], align 4 -; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I:%.*]], 1 -; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC:%.*]], 10 -; LNICM-NEXT: br label [[FOR_END:%.*]] -; LNICM: for.body3.split: -; LICM-NOT: for.body3.split: -; INTC-NOT: for.body3.split: -; LNICM-NEXT: [[TMP3:%.*]] = add nsw i32 [[I:%.*]], 1 -; LNICM-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3:%.*]], 10 -; LNICM-NEXT: br i1 [[TMP4:%.*]], label [[FOR_BODY3:%.*]], label [[FOR_END11:%.*]], !llvm.loop !0 -; LNICM: for.end: -; LNICM-NEXT: [[INC10:%.*]] = add nsw i32 [[K:%.*]], 1 -; LNICM-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC10:%.*]], 10 -; LNICM-NEXT: br i1 [[CMP:%.*]], label [[FOR_BODY:%.*]], label [[FOR_BODY3_SPLIT:%.*]], !llvm.loop !2 -; LNICM: for.end11: -; LNICM-NEXT: ret void +define dso_local void @test(i64 %n, i64 %m, ptr noalias %x, ptr noalias readonly %y, ptr readonly %z) { +; The loopnest is not interchanged when we only run loop interchange. 
+; INTC-LABEL: @test( +; INTC-NEXT: gurad: +; INTC-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0 +; INTC-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; INTC-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]] +; INTC: for.cond1.preheader.lr.ph: +; INTC-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]] +; INTC: for.i.preheader: +; INTC-NEXT: br label [[ENTRY:%.*]] +; INTC: entry: +; INTC-NEXT: br label [[FOR_BODY:%.*]] +; INTC: for.body: +; INTC-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ] +; INTC-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4 +; INTC-NEXT: br label [[FOR_BODY3:%.*]] +; INTC: for.body3: +; INTC-NEXT: [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ] +; INTC-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64 +; INTC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]] +; INTC-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; INTC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; INTC-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64 +; INTC-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]] +; INTC-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]] +; INTC-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]] +; INTC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; INTC-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; INTC-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4 +; INTC-NEXT: [[INC]] = add nsw i32 [[I_01]], 1 +; INTC-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64 +; INTC-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]] +; INTC-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; INTC: for.end: +; INTC-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1 +; INTC-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64 +; INTC-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]] +; 
INTC-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; INTC: for.end11.loopexit: +; INTC-NEXT: br label [[FOR_END11]] +; INTC: for.end11: +; INTC-NEXT: ret void +; +; The loopnest is interchanged when we run lnicm and loop interchange. +; LNICM-LABEL: @test( +; LNICM-NEXT: gurad: +; LNICM-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0 +; LNICM-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; LNICM-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]] +; LNICM: for.cond1.preheader.lr.ph: +; LNICM-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]] +; LNICM: for.i.preheader: +; LNICM-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]] +; LNICM: entry: +; LNICM-NEXT: br label [[FOR_BODY:%.*]] +; LNICM: for.body: +; LNICM-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ] +; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]] +; LNICM: for.body3.preheader: +; LNICM-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4 +; LNICM-NEXT: br label [[FOR_BODY3:%.*]] +; LNICM: for.body3: +; LNICM-NEXT: [[I_01:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER]] ] +; LNICM-NEXT: br label [[ENTRY]] +; LNICM: for.body3.split1: +; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64 +; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]] +; LNICM-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64 +; LNICM-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]] +; LNICM-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]] +; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]] +; LNICM-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; LNICM-NEXT: 
store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4 +; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I_01]], 1 +; LNICM-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64 +; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]] +; LNICM-NEXT: br label [[FOR_END]] +; LNICM: for.body3.split: +; LNICM-NEXT: [[TMP3]] = add nsw i32 [[I_01]], 1 +; LNICM-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; LNICM-NEXT: [[TMP5:%.*]] = icmp slt i64 [[TMP4]], [[M]] +; LNICM-NEXT: br i1 [[TMP5]], label [[FOR_BODY3]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; LNICM: for.end: +; LNICM-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1 +; LNICM-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64 +; LNICM-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]] +; LNICM-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_BODY3_SPLIT]], !llvm.loop [[LOOP2:![0-9]+]] +; LNICM: for.end11.loopexit: +; LNICM-NEXT: br label [[FOR_END11]] +; LNICM: for.end11: +; LNICM-NEXT: ret void +; +; The loopnest is not interchanged when we run licm and loop interchange. 
+; LICM-LABEL: @test( +; LICM-NEXT: gurad: +; LICM-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0 +; LICM-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; LICM-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]] +; LICM: for.cond1.preheader.lr.ph: +; LICM-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]] +; LICM: for.i.preheader: +; LICM-NEXT: br label [[ENTRY:%.*]] +; LICM: entry: +; LICM-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4 +; LICM-NEXT: br label [[FOR_BODY:%.*]] +; LICM: for.body: +; LICM-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ] +; LICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64 +; LICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]] +; LICM-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; LICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; LICM-NEXT: br label [[FOR_BODY3:%.*]] +; LICM: for.body3: +; LICM-NEXT: [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ] +; LICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64 +; LICM-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]] +; LICM-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]] +; LICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]] +; LICM-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; LICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; LICM-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4 +; LICM-NEXT: [[INC]] = add nsw i32 [[I_01]], 1 +; LICM-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64 +; LICM-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]] +; LICM-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; LICM: for.end: +; LICM-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1 +; LICM-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64 +; LICM-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]] +; 
LICM-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; LICM: for.end11.loopexit: +; LICM-NEXT: br label [[FOR_END11]] +; LICM: for.end11: +; LICM-NEXT: ret void +; -entry: +gurad: + %cmp23 = icmp sgt i64 %m, 0 + %cmp32 = icmp sgt i64 %n, 0 + br i1 %cmp23, label %for.cond1.preheader.lr.ph, label %for.end11 + +for.cond1.preheader.lr.ph: ; preds = %gurad + br i1 %cmp32, label %for.i.preheader, label %for.end11 + +for.i.preheader: ; preds = %for.cond1.preheader.lr.ph + br label %entry + +entry: ; preds = %for.i.preheader br label %for.body for.body: %k.02 = phi i32 [ 0, %entry ], [ %inc10, %for.end ] - %0 = load i32, i32* %z, align 4 + %0 = load i32, ptr %z, align 4 br label %for.body3 for.body3: %i.01 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] %idxprom = sext i32 %k.02 to i64 - %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom - %1 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom + %1 = load i32, ptr %arrayidx, align 4 %add = add nsw i32 %1, %0 %idxprom4 = sext i32 %i.01 to i64 - %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 %idxprom4 - %idxprom6 = sext i32 %k.02 to i64 - %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %arrayidx5, i64 0, i64 %idxprom6 - %2 = load i32, i32* %arrayidx7, align 4 + %index0 = mul i64 %idxprom4, %n + %index1 = add i64 %index0, %idxprom + %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %index1 + %2 = load i32, ptr %arrayidx7, align 4 %add8 = add nsw i32 %2, %add - store i32 %add8, i32* %arrayidx7, align 4 + store i32 %add8, ptr %arrayidx7, align 4 %inc = add nsw i32 %i.01, 1 - %cmp2 = icmp slt i32 %inc, 10 + %inc.ext = sext i32 %inc to i64 + %cmp2 = icmp slt i64 %inc.ext, %m br i1 %cmp2, label %for.body3, label %for.end, !llvm.loop !0 for.end: %inc10 = add nsw i32 %k.02, 1 - %cmp = icmp slt i32 %inc10, 10 + %inc10.ext = sext i32 %inc10 to i64 + %cmp = icmp slt i64 %inc10.ext, %n 
br i1 %cmp, label %for.body, label %for.end11, !llvm.loop !2 for.end11: Index: llvm/test/Transforms/LoopInterchange/call-instructions.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/call-instructions.ll +++ llvm/test/Transforms/LoopInterchange/call-instructions.ll @@ -4,7 +4,7 @@ ; RUN: FileCheck --input-file=%t %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/currentLimitation.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/currentLimitation.ll +++ llvm/test/Transforms/LoopInterchange/currentLimitation.ll @@ -8,7 +8,7 @@ ; RUN: FileCheck --check-prefix=DELIN --input-file=%t %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x [100 x [100 x i32]]] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/debuginfo.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/debuginfo.ll +++ llvm/test/Transforms/LoopInterchange/debuginfo.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i64]] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll +++ llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll @@ -2,6 +2,7 @@ ; RUN: opt < %s -basic-aa 
-loop-interchange -verify-dom-info -verify-loop-info \ ; RUN: -S -debug 2>&1 | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i64]] zeroinitializer @N = dso_local local_unnamed_addr global i64 100, align 8 Index: llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll +++ llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll @@ -5,6 +5,7 @@ ; Inner loop only reductions are not supported currently. See discussion at ; D53027 for more information on the required checks. +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [500 x [500 x i32]] zeroinitializer @X = common global i32 0 @B = common global [500 x [500 x i32]] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll +++ llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll @@ -2,6 +2,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \ ; RUN: -S -debug 2>&1 | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @a = common global i32 0, align 4 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4 Index: llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll +++ llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common 
global [100 x [100 x i32]] zeroinitializer @B = common global [100 x i32] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll +++ llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \ ; RUN: -S -pass-remarks=loop-interchange 2>&1 | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16 ;; Test to make sure we can handle zext instructions introduced by Index: llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll +++ llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll @@ -4,6 +4,7 @@ ; RUN: | FileCheck -check-prefix=STATS %s ; RUN: FileCheck -input-file %t %s +target triple = "powerpc64le-unknown-linux-gnu" ; no_deps_interchange just accesses a single nested array and can be interchange. ; CHECK: Name: Interchanged @@ -34,35 +35,6 @@ } -; Only the inner loop induction variable is used for memory accesses. -; Interchanging is not beneficial. 
-; CHECK: Name: InterchangeNotProfitable -; CHECK-NEXT: Function: no_bad_order -define i32 @no_bad_order(i32* %Arr) { -entry: - br label %for1.header - -for1.header: ; preds = %entry, %for1.inc - %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ] - br label %for2 - -for2: ; preds = %for1.header, %for2 - %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ] - %arrayidx6 = getelementptr inbounds i32, i32* %Arr, i64 %indvars.iv - store i32 0, i32* %arrayidx6, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1024 - br i1 %exitcond, label %for2, label %for1.inc - -for1.inc: - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024 - br i1 %exitcond21, label %for1.header, label %exit - -exit: ; preds = %for1.inc - ret i32 0 -} - ; No memory access using any induction variables, interchanging not beneficial. ; CHECK: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: no_mem_instrs Index: llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll +++ llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4 @a = common dso_local local_unnamed_addr global i32 0, align 4 Index: llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll =================================================================== --- 
llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll +++ llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s --basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = constant [200 x [100 x i32]] zeroinitializer, align 4 @a = constant i32 0, align 4 Index: llvm/test/Transforms/LoopInterchange/interchangeable.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/interchangeable.ll +++ llvm/test/Transforms/LoopInterchange/interchangeable.ll @@ -3,7 +3,7 @@ ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i64]] zeroinitializer @B = common global [100 x i64] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll +++ llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @D = common global [100 x [100 x [100 x i32]]] zeroinitializer @@ -24,31 +24,31 @@ br label %for.cond1.preheader for.cond1.preheader: ; preds = %for.inc15, %entry - %i.028 = phi i32 [ 0, 
%entry ], [ %inc16, %for.inc15 ] + %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ] br label %for.cond4.preheader for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader - %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] + %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] br label %for.body6 for.body6: ; preds = %for.body6, %for.cond4.preheader - %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] - %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %k.026, i32 %j.027, i32 %i.028 + %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] + %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i64 0, i64 %k.026, i64 %j.027, i64 %i.028 %0 = load i32, i32* %arrayidx8 %add = add nsw i32 %0, %t store i32 %add, i32* %arrayidx8 - %inc = add nuw nsw i32 %k.026, 1 - %exitcond = icmp eq i32 %inc, 100 + %inc = add nuw nsw i64 %k.026, 1 + %exitcond = icmp eq i64 %inc, 100 br i1 %exitcond, label %for.inc12, label %for.body6 for.inc12: ; preds = %for.body6 - %inc13 = add nuw nsw i32 %j.027, 1 - %exitcond29 = icmp eq i32 %inc13, 100 + %inc13 = add nuw nsw i64 %j.027, 1 + %exitcond29 = icmp eq i64 %inc13, 100 br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader for.inc15: ; preds = %for.inc12 - %inc16 = add nuw nsw i32 %i.028, 1 - %exitcond30 = icmp eq i32 %inc16, 100 + %inc16 = add nuw nsw i64 %i.028, 1 + %exitcond30 = icmp eq i64 %inc16, 100 br i1 %exitcond30, label %for.end17, label %for.cond1.preheader for.end17: ; preds = %for.inc15 Index: llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll +++ llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll @@ -3,6 +3,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -da-disable-delinearization-checks 
-pass-remarks-missed='loop-interchange' -verify-loop-lcssa -S | FileCheck -check-prefix=CHECK-DELIN %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "powerpc64le-unknown-linux-gnu" ; void foo(int n, int m) { ; int temp[16][16]; Index: llvm/test/Transforms/LoopInterchange/lcssa.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/lcssa.ll +++ llvm/test/Transforms/LoopInterchange/lcssa.ll @@ -2,7 +2,7 @@ ; RUN: FileCheck --input-file %t --check-prefix REMARK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @C = common global [100 x [100 x i32]] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll +++ llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -10,6 +10,7 @@ ; RUN: -pass-remarks='loop-interchange' -S -da-disable-delinearization-checks ; RUN: cat %t | FileCheck --check-prefix=DELIN %s +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x [100 x i32]] zeroinitializer @C = common global [100 x i32] zeroinitializer @@ -71,11 +72,7 @@ ; DELIN-NEXT: Name: InterchangeNotProfitable ; DELIN-NEXT: Function: test01 ; DELIN-NEXT: Args: -; DELIN-NEXT: - String: 'Interchanging loops is too costly (cost=' -; DELIN-NEXT: - Cost: '2' -; DELIN-NEXT: - String: ', threshold=' -; DELIN-NEXT: - Threshold: '0' -; DELIN-NEXT: - String: ') and it does not improve parallelism.' +; DELIN-NEXT: - String: Interchanging loops is too costly and it does not improve parallelism. ; DELIN-NEXT: ... 
;;--------------------------------------Test case 02------------------------------------ Index: llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll +++ llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x i32] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll +++ llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @D = common global [100 x [100 x [100 x i32]]] zeroinitializer @@ -24,31 +24,31 @@ br label %for.cond1.preheader for.cond1.preheader: ; preds = %for.inc15, %entry - %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ] + %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ] br label %for.cond4.preheader for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader - %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] + %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] br label %for.body6 for.body6: ; preds = %for.body6, %for.cond4.preheader - %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] - %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027 + %k.026 
= phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] + %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i64 %i.028, i64 %k.026, i64 %j.027 %0 = load i32, i32* %arrayidx8 %add = add nsw i32 %0, %t store i32 %add, i32* %arrayidx8 - %inc = add nuw nsw i32 %k.026, 1 - %exitcond = icmp eq i32 %inc, 100 + %inc = add nuw nsw i64 %k.026, 1 + %exitcond = icmp eq i64 %inc, 100 br i1 %exitcond, label %for.inc12, label %for.body6 for.inc12: ; preds = %for.body6 - %inc13 = add nuw nsw i32 %j.027, 1 - %exitcond29 = icmp eq i32 %inc13, 100 + %inc13 = add nuw nsw i64 %j.027, 1 + %exitcond29 = icmp eq i64 %inc13, 100 br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader for.inc15: ; preds = %for.inc12 - %inc16 = add nuw nsw i32 %i.028, 1 - %exitcond30 = icmp eq i32 %inc16, 100 + %inc16 = add nuw nsw i64 %i.028, 1 + %exitcond30 = icmp eq i64 %inc16, 100 br i1 %exitcond30, label %for.end17, label %for.cond1.preheader for.end17: ; preds = %for.inc15 Index: llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll +++ llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x i32] zeroinitializer @@ -108,13 +108,13 @@ ;; The outer loop header does not branch to the inner loop preheader, or the ;; inner loop header, or the outer loop latch. ; CHECK: Not interchanging loops. Cannot prove legality. 
-define void @interchange_07(i32 %k, i32 %N, i32 %ny) { +define void @interchange_07(i32 %k, i32 %N, i64 %ny) { entry: br label %for1.header for1.header: - %j23 = phi i32 [ 0, %entry ], [ %j.next24, %for1.inc10 ] - %cmp21 = icmp slt i32 0, %ny + %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ] + %cmp21 = icmp slt i64 0, %ny br label %singleSucc singleSucc: @@ -124,18 +124,18 @@ br label %for2 for2: - %j = phi i32 [ %j.next, %for2 ], [ 0, %preheader.j ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i32 0, i32 %j, i32 %j23 + %j = phi i64 [ %j.next, %for2 ], [ 0, %preheader.j ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 %j23 %lv = load i32, i32* %arrayidx5 %add = add nsw i32 %lv, %k store i32 %add, i32* %arrayidx5 - %j.next = add nuw nsw i32 %j, 1 - %exitcond = icmp eq i32 %j, 99 + %j.next = add nuw nsw i64 %j, 1 + %exitcond = icmp eq i64 %j, 99 br i1 %exitcond, label %for1.inc10, label %for2 for1.inc10: - %j.next24 = add nuw nsw i32 %j23, 1 - %exitcond26 = icmp eq i32 %j23, 99 + %j.next24 = add nuw nsw i64 %j23, 1 + %exitcond26 = icmp eq i64 %j23, 99 br i1 %exitcond26, label %for.end12, label %for1.header for.end12: Index: llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll +++ llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll @@ -1,5 +1,6 @@ ; RUN: opt -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = global [3 x [5 x [8 x i16]]] [[5 x [8 x i16]] zeroinitializer, [5 x [8 x i16]] [[8 x i16] zeroinitializer, [8 x i16] [i16 0, i16 0, i16 0, i16 6, i16 1, i16 6, i16 0, i16 0], [8 x i16] zeroinitializer, [8 x i16] zeroinitializer, [8 x i16] zeroinitializer], [5 x [8 x i16]] 
zeroinitializer], align 2 @a = common global i32 0, align 4 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4 Index: llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll +++ llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll @@ -4,6 +4,7 @@ ; Outer loop only reductions are not supported currently. +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [500 x [500 x i32]] zeroinitializer ;; global X Index: llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll +++ llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll @@ -3,6 +3,7 @@ ; Test case for PR41725. The induction variables in the latches escape the ; loops and we must move some PHIs around. +target triple = "powerpc64le-unknown-linux-gnu" @a = common dso_local global i64 0, align 4 @b = common dso_local global i64 0, align 4 @c = common dso_local global [10 x [10 x i32 ]] zeroinitializer, align 16 @@ -157,7 +158,7 @@ ; Make sure we do not crash for loops without reachable exits. define void @no_reachable_exits() { ; Check we interchanged. 
-; CHECK-LABEL: @no_reachable_exits() { +; CHECK-LABEL: @no_reachable_exits() ; CHECK-NEXT: bb: ; CHECK-NEXT: br label %inner.ph ; CHECK-LABEL: outer.ph: Index: llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll +++ llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = external dso_local global [5 x i32], align 16 define void @test1() { Index: llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll +++ llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll @@ -14,6 +14,8 @@ ; } ; } +target triple = "powerpc64le-unknown-linux-gnu" + ; REMARKS: --- !Passed ; REMARKS-NEXT: Pass: loop-interchange ; REMARKS-NEXT: Name: Interchanged Index: llvm/test/Transforms/LoopInterchange/pr43326.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr43326.ll +++ llvm/test/Transforms/LoopInterchange/pr43326.ll @@ -2,6 +2,7 @@ ; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s +target triple = "powerpc64le-unknown-linux-gnu" @a = global i32 0 @b = global i8 0 @c = global i32 0 Index: llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll +++ 
llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll @@ -6,6 +6,8 @@ ; In the 2 test cases below, we have a LCSSA PHI in the inner loop exit, which ; is used in the outer loop latch. This is not supported. +target triple = "powerpc64le-unknown-linux-gnu" + define void @test1() { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: Index: llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll +++ llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll @@ -3,6 +3,7 @@ ; Tests for PR43797. +target triple = "powerpc64le-unknown-linux-gnu" @wdtdr = external dso_local global [5 x [5 x double]], align 16 define void @test1() { Index: llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll +++ llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @global = external local_unnamed_addr global [400 x [400 x i32]], align 16 ; We need to move %tmp4 from the inner loop pre header to the outer loop header Index: llvm/test/Transforms/LoopInterchange/pr48212.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/pr48212.ll +++ llvm/test/Transforms/LoopInterchange/pr48212.ll @@ -2,6 +2,8 @@ ; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s +target triple = "powerpc64le-unknown-linux-gnu" + ; REMARKS: --- !Passed ; REMARKS-NEXT: Pass: loop-interchange ; 
REMARKS-NEXT: Name: Interchanged Index: llvm/test/Transforms/LoopInterchange/profitability.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/profitability.ll +++ llvm/test/Transforms/LoopInterchange/profitability.ll @@ -5,7 +5,7 @@ ;; We test profitability model in these test cases. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x [100 x i32]] zeroinitializer Index: llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll +++ llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" ; REMARKS: --- !Passed ; REMARKS-NEXT: Pass: loop-interchange Index: llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll +++ llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -S %s | FileCheck %s - +target triple = "powerpc64le-unknown-linux-gnu" @global = external dso_local global [1000 x [1000 x i32]], align 16 ; Test that we support updating conditional branches where both targets are the same Index: llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll =================================================================== --- 
llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll +++ llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -loop-interchange-threshold=-10 -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" + ; The test contains a GEP with an operand that is not SCEV-able. Make sure ; loop-interchange does not crash. define void @test([256 x float]* noalias %src, float* %dst) {