diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -300,6 +300,8 @@ // TODO: Investigate promotion cap for O1. LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM1.addPass(SimpleLoopUnswitchPass()); + if (EnableLoopFlatten) + LPM1.addPass(LoopFlattenPass()); LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(IndVarSimplifyPass()); @@ -311,8 +313,6 @@ if (EnableLoopInterchange) LPM2.addPass(LoopInterchangePass()); - if (EnableLoopFlatten) - LPM2.addPass(LoopFlattenPass()); // Do not enable unrolling in PreLinkThinLTO phase during sample PGO // because it changes IR to makes profile annotation in back compile @@ -475,6 +475,9 @@ LPM1.addPass( SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && EnableO3NonTrivialUnswitching)); + if (EnableLoopFlatten) + LPM1.addPass(LoopFlattenPass()); + LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(IndVarSimplifyPass()); @@ -485,8 +488,6 @@ if (EnableLoopInterchange) LPM2.addPass(LoopInterchangePass()); - if (EnableLoopFlatten) - LPM2.addPass(LoopFlattenPass()); // Do not enable unrolling in PreLinkThinLTO phase during sample PGO // because it changes IR to makes profile annotation in back compile @@ -1628,10 +1629,10 @@ MainFPM.addPass(ConstraintEliminationPass()); LoopPassManager LPM; - LPM.addPass(IndVarSimplifyPass()); - LPM.addPass(LoopDeletionPass()); if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) LPM.addPass(LoopFlattenPass()); + LPM.addPass(IndVarSimplifyPass()); + LPM.addPass(LoopDeletionPass()); // FIXME: Add loop interchange. // Unroll small loops and perform peeling. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
+; Phase-ordering check: with loop flattening enabled, the i/j loop nest in
+; @_Z3fooPiii must come out of the pipeline as one loop of N * M iterations.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+define dso_local void @_Z3fooPiii(i32* %A, i32 %N, i32 %M) #0 {
+; CHECK-LABEL: @_Z3fooPiii(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP3]], i1 [[CMP21]], i1 false
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.lr.ph.split.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul nuw nsw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[INDVAR6:%.*]] = phi i64 [ [[INDVAR_NEXT7:%.*]], [[FOR_COND1_PREHEADER_US]] ], [ 0, [[FOR_COND1_PREHEADER_LR_PH_SPLIT_US]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVAR6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    tail call void @_Z1fi(i32 [[TMP2]])
+; CHECK-NEXT:    [[INDVAR_NEXT7]] = add nuw nsw i64 [[INDVAR6]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVAR_NEXT7]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  br label %for.cond1
+
+for.cond1:
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.body4 ]
+  %cmp2 = icmp slt i32 %j.0, %M
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+  %inc6 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.body4:
+  %mul = mul nsw i32 %i.0, %M
+  %add = add nsw i32 %mul, %j.0
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  call void @_Z1fi(i32 %0)
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+}
+
+declare dso_local void @_Z1fi(i32) #2