Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -242,6 +242,7 @@ void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&); void initializeLoopInterchangePass(PassRegistry&); +void initializeLoopFlattenLegacyPassPass(PassRegistry&); void initializeLoopLoadEliminationPass(PassRegistry&); void initializeLoopPassPass(PassRegistry&); void initializeLoopPredicationLegacyPassPass(PassRegistry&); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -127,6 +127,7 @@ (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); (void) llvm::createLoopInterchangePass(); + (void) llvm::createLoopFlattenPass(); (void) llvm::createLoopPredicationPass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopSimplifyCFGPass(); Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -149,6 +149,12 @@ // Pass *createLoopInterchangePass(); +//===----------------------------------------------------------------------===// +// +// LoopFlatten - This pass flattens nested loops into a single loop. +// +Pass *createLoopFlattenPass(); + //===----------------------------------------------------------------------===// // // LoopStrengthReduce - This pass is strength reduces GEP instructions that use Index: llvm/include/llvm/Transforms/Scalar/LoopFlatten.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/LoopFlatten.h @@ -0,0 +1,33 @@ +//===- LoopFlatten.h - Loop Flatten ---------------- -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the interface for the Loop Flatten Pass. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H +#define LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H + +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +class LoopFlattenPass : public PassInfoMixin { +public: + LoopFlattenPass() = default; + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -149,6 +149,7 @@ #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Scalar/LoopDistribute.h" +#include "llvm/Transforms/Scalar/LoopFlatten.h" #include "llvm/Transforms/Scalar/LoopFuse.h" #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Scalar/LoopInstSimplify.h" @@ -249,6 +250,10 @@ "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)")); +static cl::opt EnableLoopFlatten( + "enable-npm-loop-flatten", cl::init(false), cl::Hidden, + cl::desc("Enable the Loop flattening pass for the new PM (default = off)")); + static cl::opt EnableSyntheticCounts( "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::desc("Run synthetic function entry count generation " @@ -509,6 +514,8 @@ C(LPM2, Level); LPM2.addPass(LoopDeletionPass()); + if (EnableLoopFlatten) + LPM2.addPass(LoopFlattenPass()); // Do not enable unrolling in PreLinkThinLTO phase during sample PGO // because it changes IR to makes profile annotation in back compile // inaccurate. The normal unroller doesn't pay attention to forced full unroll Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -358,6 +358,7 @@ LOOP_PASS("no-op-loop", NoOpLoopPass()) LOOP_PASS("print", PrintLoopPass(dbgs())) LOOP_PASS("loop-deletion", LoopDeletionPass()) +LOOP_PASS("loop-flatten", LoopFlattenPass()) LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) LOOP_PASS("indvars", IndVarSimplifyPass()) Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -92,6 +92,10 @@ cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); +static cl::opt EnableLoopFlatten("enable-loop-flatten", cl::init(false), + cl::Hidden, + cl::desc("Enable the LoopFlatten Pass")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -444,6 +448,10 @@ if (EnableLoopInterchange) MPM.add(createLoopInterchangePass()); // Interchange loops + if (EnableLoopFlatten) { + MPM.add(createLoopFlattenPass()); // Flatten loops + MPM.add(createLoopSimplifyCFGPass()); + } // Unroll small loops MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, @@ -1035,6 +1043,8 @@ PM.add(createLoopDeletionPass()); if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); + if (EnableLoopFlatten) + PM.add(createLoopFlattenPass()); // Unroll small loops PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -32,6 +32,7 @@ LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp + LoopFlatten.cpp LoopLoadElimination.cpp LoopPassManager.cpp LoopPredication.cpp Index: llvm/lib/Transforms/Scalar/Scalar.cpp =================================================================== --- llvm/lib/Transforms/Scalar/Scalar.cpp +++ llvm/lib/Transforms/Scalar/Scalar.cpp @@ -67,6 +67,7 @@ initializeLoopAccessLegacyAnalysisPass(Registry); initializeLoopInstSimplifyLegacyPassPass(Registry); initializeLoopInterchangePass(Registry); + initializeLoopFlattenLegacyPassPass(Registry); initializeLoopPredicationLegacyPassPass(Registry); initializeLoopRotateLegacyPassPass(Registry); initializeLoopStrengthReducePass(Registry); @@ -186,6 +187,10 @@ unwrap(PM)->add(createLoopDeletionPass()); } +void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopFlattenPass()); +} + void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopIdiomPass()); } Index: llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll @@ -0,0 +1,395 @@ +; RUN: opt < %s -S -loop-flatten -debug-only=loop-flatten 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Every function in this file has a reason that it can't be transformed. + +; CHECK-NOT: Checks all passed, doing the transformation + +; Outer loop does not start at zero +define void @test_1(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %cmp25 = icmp sgt i32 %N, 0 + br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup + +for.body4.lr.ph: + %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 1, %entry ] + %mul = mul nsw i32 %i.026, %N + br label %for.body4 + +for.body4: + %j.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ] + %add = add nsw i32 %j.024, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul5 = mul nsw i32 %0, %scale + %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul5, i32* %arrayidx8, align 4 + %inc = add nuw nsw i32 %j.024, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 + +for.cond.cleanup3: + %inc10 = add nuw nsw i32 %i.026, 1 + %exitcond27 = icmp eq i32 %inc10, %N + br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph + +for.cond.cleanup: + ret void +} + +; Inner loop does not start at zero +define void @test_2(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %cmp25 = icmp sgt i32 %N, 0 + br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup + +for.body4.lr.ph: + %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 0, %entry ] + %mul = mul nsw i32 %i.026, %N + br label %for.body4 + +for.body4: + %j.024 = phi i32 [ 1, %for.body4.lr.ph ], [ %inc, %for.body4 ] + %add = add nsw i32 %j.024, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul5 = mul nsw i32 %0, %scale + %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul5, i32* %arrayidx8, align 4 + %inc = add nuw nsw i32 %j.024, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 + +for.cond.cleanup3: + %inc10 = add nuw nsw i32 %i.026, 1 + %exitcond27 = icmp eq i32 %inc10, %N + br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph + +for.cond.cleanup: + ret void +} + +; Outer IV used directly +define hidden void @test_3(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp25 = icmp eq i16 %N, 0 + br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i.026.us + %mul9.us = mul nuw nsw i32 %i.026.us, %conv + br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us + %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %0 = load i32, i32* %arrayidx.us, align 4 + %mul.us = mul nsw i32 %0, %scale + %add.us = add nuw nsw i32 %j.024.us, %mul9.us + %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul.us, i32* %arrayidx10.us, align 4 + %inc.us = add nuw nsw i32 %j.024.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc12.us = add nuw nsw i32 %i.026.us, 1 + %exitcond27 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; Inner IV used directly +define hidden void @test_4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp25 = icmp eq i16 %N, 0 + br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %mul9.us = mul nuw nsw i32 %i.026.us, %conv + br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us + %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %j.024.us + %0 = load i32, i32* %arrayidx.us, align 4 + %mul.us = mul nsw i32 %0, %scale + %add.us = add nuw nsw i32 %j.024.us, %mul9.us + %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul.us, i32* %arrayidx10.us, align 4 + %inc.us = add nuw nsw i32 %j.024.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc12.us = add nuw nsw i32 %i.026.us, 1 + %exitcond27 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +} + +; Inner iteration count not invariant in outer loop +declare i32 @get_int() readonly +define void @test_5(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp27 = icmp eq i16 %N, 0 + br i1 %cmp27, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.cond.cleanup5 + %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %inc12, %for.cond.cleanup5 ] + %call = tail call i32 @get_int() + %cmp325 = icmp sgt i32 %call, 0 + br i1 %cmp325, label %for.body6.lr.ph, label %for.cond.cleanup5 + +for.body6.lr.ph: ; preds = %for.body + %mul = mul nsw i32 %call, %i.028 + br label %for.body6 + +for.cond.cleanup5.loopexit: ; preds = %for.body6 + br label %for.cond.cleanup5 + +for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body + %inc12 = add nuw nsw i32 %i.028, 1 + %exitcond29 = icmp ne i32 %inc12, %conv + br i1 %exitcond29, label %for.body, label %for.cond.cleanup.loopexit + +for.body6: ; preds = %for.body6.lr.ph, %for.body6 + %j.026 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] + %add = add nsw i32 %j.026, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul7 = mul nsw i32 %0, %scale + %arrayidx10 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul7, i32* %arrayidx10, align 4 + %inc = add nuw nsw i32 %j.026, 1 + %exitcond = icmp ne i32 %inc, %call + br i1 %exitcond, label %for.body6, label %for.cond.cleanup5.loopexit +} + +; Inner loop has an early exit +define hidden void @test_6(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp39 = icmp eq i16 %N, 0 + br i1 %cmp39, label %for.cond.cleanup, label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.body.us.preheader, %cleanup.us + %i.040.us = phi i32 [ %inc19.us, %cleanup.us ], [ 0, %for.body.us.preheader ] + %mul.us = mul nuw nsw i32 %i.040.us, %conv + br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %if.end.us + %j.038.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ] + %add.us = add nuw nsw i32 %j.038.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %tobool.us = icmp eq i32 %0, 0 + br i1 %tobool.us, label %if.end.us, label %cleanup.us + +cleanup.us: ; preds = %if.end.us, %for.body7.us + %inc19.us = add nuw nsw i32 %i.040.us, 1 + %exitcond = icmp eq i32 %inc19.us, %conv + br i1 %exitcond, label %for.cond.cleanup, label %for.body.us + +if.end.us: ; preds = %for.body7.us + %arrayidx17.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 0, i32* %arrayidx17.us, align 4 + %inc.us = add nuw nsw i32 %j.038.us, 1 + %cmp4.us = icmp ult i32 %inc.us, %conv + br i1 %cmp4.us, label %for.body7.us, label %cleanup.us + +for.cond.cleanup: ; preds = %cleanup.us, %entry + ret void +} + +define hidden void @test_7(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp30 = icmp eq i16 %N, 0 + br i1 %cmp30, label %cleanup, label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.body.us.preheader, %for.cond2.for.cond.cleanup6_crit_edge.us + %i.031.us = phi i32 [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ], [ 0, %for.body.us.preheader ] + %call.us = tail call i32 @get_int() #2 + %tobool.us = icmp eq i32 %call.us, 0 + br i1 %tobool.us, label %for.body7.lr.ph.us, label %cleanup + +for.body7.us: ; preds = %for.body7.us, %for.body7.lr.ph.us + %j.029.us = phi i32 [ 0, %for.body7.lr.ph.us ], [ %inc.us, %for.body7.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %mul9.us = mul nsw i32 %0, %scale + %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul9.us, i32* %arrayidx13.us, align 4 + %inc.us = add nuw nsw i32 %j.029.us, 1 + %exitcond = icmp eq i32 %inc.us, %conv + br i1 %exitcond, label %for.cond2.for.cond.cleanup6_crit_edge.us, label %for.body7.us + +for.body7.lr.ph.us: ; preds = %for.body.us + %mul.us = mul nuw nsw i32 %i.031.us, %conv + br label %for.body7.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc15.us = add nuw nsw i32 %i.031.us, 1 + %cmp.us = icmp ult i32 %inc15.us, %conv + br i1 %cmp.us, label %for.body.us, label %cleanup + +cleanup: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.us, %entry + ret void +} + +; Step is not 1 +define i32 @test_8(i32 %val, i16* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 2 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; Step is not 1 +define i32 @test_9(i32 %val, i16* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 2 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; Outer loop conditional phi +define i32 @e() { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.end16 + %f.033 = phi i32 [ 0, %entry ], [ %inc18, %for.end16 ] + %g.032 = phi i32 [ undef, %entry ], [ %g.3.lcssa, %for.end16 ] + %.pr = add i32 10, 10 + %tobool29 = icmp eq i32 %.pr, 0 + br i1 %tobool29, label %for.end, label %for.body2.lr.ph + +for.body2.lr.ph: ; preds = %for.body + br label %for.cond1.for.end_crit_edge + +for.cond1.for.end_crit_edge: ; preds = %for.body2.lr.ph + br label %for.end + +for.end: ; preds = %for.cond1.for.end_crit_edge, %for.body + %g.1.lcssa = phi i32 [ 0, %for.cond1.for.end_crit_edge ], [ %g.032, %for.body ] + br label %for.body5 + +for.body5: ; preds = %for.end, %lor.end + %i.031 = phi i32 [ 0, %for.end ], [ %inc15, %lor.end ] + %g.230 = phi i32 [ %g.1.lcssa, %for.end ], [ %g.3, %lor.end ] + %0 = add i32 10, 10 + %1 = add i32 10, 10 + %tobool9 = icmp eq i32 %1, 0 + br i1 %tobool9, label %lor.rhs, label %lor.end + +lor.rhs: ; preds = %for.body5 + %2 = add i32 10, 10 + %call11 = add i32 10, 10 + %tobool12 = icmp ne i32 %call11, 0 + br label %lor.end + +lor.end: ; preds = %for.body5, %lor.rhs + %g.3 = phi i32 [ %g.230, %for.body5 ], [ %call11, %lor.rhs ] + %3 = phi i1 [ true, %for.body5 ], [ %tobool12, %lor.rhs ] + %lor.ext = zext i1 %3 to i32 + %inc15 = add nuw nsw i32 %i.031, 1 + %exitcond = icmp ne i32 %inc15, 9 + br i1 %exitcond, label %for.body5, label %for.end16 + +for.end16: ; preds = %lor.end + %g.3.lcssa = phi i32 [ %g.3, %lor.end ] + %inc18 = add nuw nsw i32 %f.033, 1 + %exitcond34 = icmp ne i32 %inc18, 7 + br i1 %exitcond34, label %for.body, label %for.end19 + +for.end19: ; preds = %for.end16 + ret i32 undef +} Index: llvm/test/Transforms/LoopFlatten/loop-flatten.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFlatten/loop-flatten.ll @@ -0,0 +1,591 @@ +; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s +; RUN: opt < %s -S -passes="loop-flatten" -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +; CHECK-LABEL: test1 +; Simple loop where the IV's is constant +define i32 @test1(i32 %val, i16* nocapture %A) { +entry: + br label %for.body +; CHECK: entry: +; CHECK: %flatten.tripcount = mul i32 20, 10 +; CHECK: br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 +; CHECK: for.body: +; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] +; CHECK: %mul = mul nuw nsw i32 %i.018, 20 +; CHECK: br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 +; CHECK: for.body3: +; CHECK: %j.017 = phi i32 [ 0, %for.body ] +; CHECK: %add = add nuw nsw i32 %j.017, %mul +; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 +; CHECK: %0 = load i16, i16* %arrayidx, align 2 +; CHECK: %conv16 = zext i16 %0 to i32 +; CHECK: %add4 = add i32 %conv16, %val +; CHECK: %conv5 = trunc i32 %add4 to i16 +; CHECK: store i16 %conv5, i16* %arrayidx, align 2 +; CHECK: %inc = add nuw nsw i32 %j.017, 1 +; CHECK: %exitcond = icmp ne i32 %inc, 20 +; CHECK: br label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 +; CHECK: for.inc6: +; CHECK: %inc7 = add nuw nsw i32 %i.018, 1 +; CHECK: %exitcond19 = icmp ne i32 %inc7, %flatten.tripcount +; CHECK: br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; CHECK-LABEL: test2 +; Same as above but non constant IV (which still cannot overflow) +define i32 @test2(i8 zeroext %I, i32 %val, i16* nocapture %A) { +entry: + %conv = zext i8 %I to i32 + %cmp26 = icmp eq i8 %I, 0 + br i1 %cmp26, label %for.end13, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %conv, %conv +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.inc11_crit_edge.us, %for.body.lr.ph.split.us + %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.027.us, %conv + br label %for.body6.us +; CHECK: for.body.us: +; CHECK: %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ] +; CHECK: %mul.us = mul nuw nsw i32 %i.027.us, %conv +; CHECK: br label %for.body6.us + +for.body6.us: ; preds = %for.body.us, %for.body6.us + %j.025.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body6.us ] + %add.us = add nuw nsw i32 %j.025.us, %mul.us + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us + %0 = load i16, i16* %arrayidx.us, align 2 + %conv823.us = zext i16 %0 to i32 + %add9.us = add i32 %conv823.us, %val + %conv10.us = trunc i32 %add9.us to i16 + store i16 %conv10.us, i16* %arrayidx.us, align 2 + %inc.us = add nuw nsw i32 %j.025.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body6.us, label %for.cond2.for.inc11_crit_edge.us +; CHECK: for.body6.us: +; CHECK: %j.025.us = phi i32 [ 0, %for.body.us ] +; CHECK: %add.us = add nuw nsw i32 %j.025.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.027.us +; CHECK: %0 = load i16, i16* %arrayidx.us, align 2 +; CHECK: %conv823.us = zext i16 %0 to i32 +; CHECK: %add9.us = add i32 %conv823.us, %val +; CHECK: %conv10.us = trunc i32 %add9.us to i16 +; CHECK: store i16 %conv10.us, i16* %arrayidx.us, align 2 +; CHECK: %inc.us = add nuw nsw i32 %j.025.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %conv +; CHECK: br label %for.cond2.for.inc11_crit_edge.us + +for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us + %inc12.us = add nuw nsw i32 %i.027.us, 1 + %exitcond28 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit +; CHECK: for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us +; CHECK: %inc12.us = add nuw nsw i32 %i.027.us, 1 +; CHECK: %exitcond28 = icmp ne i32 %inc12.us, %flatten.tripcount +; CHECK: br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit + +for.end13.loopexit: ; preds = %for.cond2.for.inc11_crit_edge.us + br label %for.end13 + +for.end13: ; preds = %for.end13.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %conv, %for.end13.loopexit ] + ret i32 %i.0.lcssa +} + + +; CHECK-LABEL: test3 +; Same as above, uses load to determine it cant overflow +define i32 @test3(i32 %N, i32 %val, i16* nocapture %A) local_unnamed_addr #0 { +entry: + %cmp21 = icmp eq i32 %N, 0 + br i1 %cmp21, label %for.end8, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %N, %N +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.inc6_crit_edge.us, %for.body.lr.ph.split.us + %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ] + %mul.us = mul i32 %i.022.us, %N + br label %for.body3.us +; CHECK: for.body.us: +; CHECK: %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ] +; CHECK: %mul.us = mul i32 %i.022.us, %N +; CHECK: br label %for.body3.us + +for.body3.us: ; preds = %for.body.us, %for.body3.us + %j.020.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ] + %add.us = add i32 %j.020.us, %mul.us + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us + %0 = load i16, i16* %arrayidx.us, align 2 + %conv18.us = zext i16 %0 to i32 + %add4.us = add i32 %conv18.us, %val + %conv5.us = trunc i32 %add4.us to i16 + store i16 %conv5.us, i16* %arrayidx.us, align 2 + %inc.us = add nuw i32 %j.020.us, 1 + %exitcond = icmp ne i32 %inc.us, %N + br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc6_crit_edge.us +; CHECK: for.body3.us: +; CHECK: %j.020.us = phi i32 [ 0, %for.body.us ] +; CHECK: %add.us = add i32 %j.020.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.022.us +; CHECK: %0 = load i16, i16* %arrayidx.us, align 2 +; CHECK: %conv18.us = zext i16 %0 to i32 +; CHECK: %add4.us = add i32 %conv18.us, %val +; CHECK: %conv5.us = trunc i32 %add4.us to i16 +; CHECK: store i16 %conv5.us, i16* %arrayidx.us, align 2 +; CHECK: %inc.us = add nuw i32 %j.020.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %N +; CHECK: br label %for.cond1.for.inc6_crit_edge.us + +for.cond1.for.inc6_crit_edge.us: ; preds = %for.body3.us + %inc7.us = add nuw i32 %i.022.us, 1 + %exitcond23 = icmp ne i32 %inc7.us, %N + br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit +; CHECK: for.cond1.for.inc6_crit_edge.us: +; CHECK: %inc7.us = add nuw i32 %i.022.us, 1 +; CHECK: %exitcond23 = icmp ne i32 %inc7.us, %flatten.tripcount +; CHECK: br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit + +for.end8.loopexit: ; preds = %for.cond1.for.inc6_crit_edge.us + br label %for.end8 + +for.end8: ; preds = %for.end8.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %N, %for.end8.loopexit ] + ret i32 %i.0.lcssa +} + + +; CHECK-LABEL: test4 +; Multiplication cannot overflow, so we can replace the original loop. +define void @test4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp30 = icmp eq i16 %N, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph.split.us +; CHECK: entry: +; CHECK: %[[LIMIT:.*]] = zext i16 %N to i32 +; CHECK: br i1 %{{.*}} label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %[[TRIPCOUNT:.*]] = mul i32 %[[LIMIT]], %[[LIMIT]] +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.031.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.031.us, %conv + br label %for.body7.us +; CHECK: for.body.us: +; CHECK: %[[OUTER_IV:.*]] = phi i32 +; CHECK: br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us +; CHECK: for.body7.us: + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us +; CHECK: getelementptr inbounds i32, i32* %A, i32 %[[OUTER_IV]] + %0 = load i32, i32* %arrayidx.us, align 4 + %mul9.us = mul nsw i32 %0, %scale +; CHECK: getelementptr inbounds i32, i32* %C, i32 %[[OUTER_IV]] + %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul9.us, i32* %arrayidx13.us, align 4 + %inc.us = add nuw nsw i32 %j.029.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us +; CHECK: br label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc15.us = add nuw nsw i32 %i.031.us, 1 + %exitcond32 = icmp ne i32 %inc15.us, %conv + br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond2.for.cond.cleanup6_crit_edge.us: +; CHECK: br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup +; CHECK: for.cond.cleanup.loopexit: +; CHECK: br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +; CHECK: for.cond.cleanup: +; CHECK: ret void +} + + +; CHECK-LABEL: test5 +define i32 @test5(i8 zeroext %I, i16 zeroext %J) { +entry: + %0 = lshr i8 %I, 1 + %div = zext i8 %0 to i32 + %cmp30 = icmp eq i8 %0, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %1 = lshr i16 %J, 1 + %div5 = zext i16 %1 to i32 + %cmp627 = icmp eq i16 %1, 0 + br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %for.body.lr.ph + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %div5, %div +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us + %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.body9.us +; CHECK: for.body.us: +; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: br label %for.body9.us + +for.body9.us: ; preds = %for.body.us, %for.body9.us + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ] + %x.128.us = phi i32 [ %x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ] + %call.us = tail call i32 @func(i32 1) + %sub.us = sub nsw i32 %call.us, %x.128.us + %xor.us = xor i32 %sub.us, %x.128.us + %inc.us = add nuw nsw i32 %j.029.us, 1 + %cmp6.us = icmp ult i32 %inc.us, %div5 + br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us +; CHECK: for.body9.us: +; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ] +; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ] +; CHECK: %call.us = tail call i32 @func(i32 1) +; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us +; CHECK: %xor.us = xor i32 %sub.us, %x.128.us +; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1 +; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5 +; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us + +for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] + %inc13.us = add nuw nsw i32 %i.032.us, 1 + %cmp.us = icmp ult i32 %inc13.us, %div + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us: +; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] +; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1 +; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount +; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.body.lr.ph.split: ; preds = %for.body.lr.ph + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us + %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.cond.cleanup + +for.cond.cleanup.loopexit34: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ] + ret i32 %x.0.lcssa + +for.body: ; preds = %for.body.lr.ph.split, %for.body + %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ] + %inc13 = add nuw nsw i32 %i.032, 1 + %cmp = icmp ult i32 %inc13, %div + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34 +} + + +; CHECK-LABEL: test6 +define i32 @test6(i8 zeroext %I, i16 zeroext %J) { +entry: + %0 = lshr i8 %I, 1 + %div = zext i8 %0 to i32 + %cmp30 = icmp eq i8 %0, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %1 = lshr i16 %J, 1 + %div5 = zext i16 %1 to i32 + %cmp627 = icmp eq i16 %1, 0 + br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %for.body.lr.ph + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %div5, %div +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us + %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.032.us, %div5 + br label %for.body9.us +; CHECK: for.body.us: +; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %mul.us = mul nuw nsw i32 %i.032.us, %div5 +; CHECK: br label %for.body9.us + +for.body9.us: ; preds = %for.body.us, %for.body9.us + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ] + %x.128.us = phi i32 [ %x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %call.us = tail call i32 @func(i32 %add.us) + %sub.us = sub nsw i32 %call.us, %x.128.us + %xor.us = xor i32 %sub.us, %x.128.us + %inc.us = add nuw nsw i32 %j.029.us, 1 + %cmp6.us = icmp ult i32 %inc.us, %div5 + br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us +; CHECK: for.body9.us: +; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ] +; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ] +; CHECK: %add.us = add nuw nsw i32 %j.029.us, %mul.us +; CHECK: %call.us = tail call i32 @func(i32 %i.032.us) +; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us +; CHECK: %xor.us = xor i32 %sub.us, %x.128.us +; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1 +; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5 +; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us + +for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] + %inc13.us = add nuw nsw i32 %i.032.us, 1 + %cmp.us = icmp ult i32 %inc13.us, %div + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us: +; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] +; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1 +; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount +; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.body.lr.ph.split: ; preds = %for.body.lr.ph + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us + %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.cond.cleanup + +for.cond.cleanup.loopexit34: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ] + ret i32 %x.0.lcssa + +for.body: ; preds = %for.body.lr.ph.split, %for.body + %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ] + %inc13 = add nuw nsw i32 %i.032, 1 + %cmp = icmp ult i32 %inc13, %div + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34 +} + +; CHECK-LABEL: test7 +; Various inner phis and conditions which we can still work with +define signext i16 @test7(i32 %I, i32 %J, i32* nocapture readonly %C, i16 signext %limit) { +entry: + %cmp43 = icmp eq i32 %J, 0 + br i1 %cmp43, label %for.end17, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %conv = sext i16 %limit to i32 + br label %for.body.us +; CHECK: for.body.lr.ph: +; CHECK: %conv = sext i16 %limit to i32 +; CHECK: %flatten.tripcount = mul i32 %J, %J +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.inc15_crit_edge.us, %for.body.lr.ph + %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ] + %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %mul.us = mul i32 %i.047.us, %J + br label %for.body3.us +; CHECK: for.body.us: +; CHECK: %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %mul.us = mul i32 %i.047.us, %J +; CHECK: br label %for.body3.us + +for.body3.us: ; preds = %for.body.us, %if.end.us + %j.040.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ] + %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ], [ %ret.2.us, %if.end.us ] + %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ], [ %0, %if.end.us ] + %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ], [ %tmp.2.us, %if.end.us ] + %add.us = add i32 %j.040.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %add4.us = add nsw i32 %0, %tmp.137.us + %cmp5.us = icmp sgt i32 %add4.us, %conv + br i1 %cmp5.us, label %if.then.us, label %if.else.us +; CHECK: for.body3.us: +; CHECK: %j.040.us = phi i32 [ 0, %for.body.us ] +; CHECK: %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ] +; CHECK: %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ] +; CHECK: %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ] +; CHECK: %add.us = add i32 %j.040.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %i.047.us +; CHECK: %0 = load i32, i32* %arrayidx.us, align 4 +; CHECK: %add4.us = add nsw i32 %0, %tmp.137.us +; CHECK: %cmp5.us = icmp sgt i32 %add4.us, %conv +; CHECK: br i1 %cmp5.us, label %if.then.us, label %if.else.us + +if.else.us: ; preds = %for.body3.us + %cmp10.us = icmp sgt i32 %0, %prev.138.us + %cond.us = zext i1 %cmp10.us to i32 + %conv1235.us = zext i16 %ret.139.us to i32 + %add13.us = add nuw nsw i32 %cond.us, %conv1235.us + br label %if.end.us +; CHECK: if.else.us: +; CHECK: %cmp10.us = icmp sgt i32 %0, %prev.138.us +; CHECK: %cond.us = zext i1 %cmp10.us to i32 +; CHECK: %conv1235.us = zext i16 %ret.139.us to i32 +; CHECK: %add13.us = add nuw nsw i32 %cond.us, %conv1235.us +; CHECK: br label %if.end.us + +if.then.us: ; preds = %for.body3.us + %conv7.us = sext i16 %ret.139.us to i32 + %add8.us = add nsw i32 %conv7.us, 10 + br label %if.end.us +; CHECK: if.then.us: +; CHECK: %conv7.us = sext i16 %ret.139.us to i32 +; CHECK: %add8.us = add nsw i32 %conv7.us, 10 +; CHECK: br label %if.end.us + +if.end.us: ; preds = %if.then.us, %if.else.us + %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ] + %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ] + %ret.2.us = trunc i32 %ret.2.in.us to i16 + %inc.us = add nuw i32 %j.040.us, 1 + %exitcond = icmp ne i32 %inc.us, %J + br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc15_crit_edge.us +; CHECK: if.end.us: +; CHECK: %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ] +; CHECK: %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ] +; CHECK: %ret.2.us = trunc i32 %ret.2.in.us to i16 +; CHECK: %inc.us = add nuw i32 %j.040.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %J +; CHECK: br label %for.cond1.for.inc15_crit_edge.us + +for.cond1.for.inc15_crit_edge.us: ; preds = %if.end.us + %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ] + %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ] + %.lcssa = phi i32 [ %0, %if.end.us ] + %inc16.us = add nuw i32 %i.047.us, 1 + %exitcond49 = icmp ne i32 %inc16.us, %J + br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit +; CHECK: for.cond1.for.inc15_crit_edge.us: +; CHECK: %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ] +; CHECK: %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ] +; CHECK: %.lcssa = phi i32 [ %0, %if.end.us ] +; CHECK: %inc16.us = add nuw i32 %i.047.us, 1 +; CHECK: %exitcond49 = icmp ne i32 %inc16.us, %flatten.tripcount +; CHECK: br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit + +for.end17.loopexit: ; preds = %for.cond1.for.inc15_crit_edge.us + %ret.2.us.lcssa.lcssa = phi i16 [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + br label %for.end17 + +for.end17: ; preds = %for.end17.loopexit, %entry + %ret.0.lcssa = phi i16 [ 0, %entry ], [ %ret.2.us.lcssa.lcssa, %for.end17.loopexit ] + ret i16 %ret.0.lcssa +} + +; CHECK-LABEL: test8 +; Same as test1, but with different continue block order +; (uses icmp eq and loops on false) +define i32 @test8(i32 %val, i16* nocapture %A) { +entry: + br label %for.body +; CHECK: entry: +; CHECK: %flatten.tripcount = mul i32 20, 10 +; CHECK: br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 +; CHECK: for.body: +; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] +; CHECK: %mul = mul nuw nsw i32 %i.018, 20 +; CHECK: br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp eq i32 %inc, 20 + br i1 %exitcond, label %for.inc6, label %for.body3 +; CHECK: for.body3: +; CHECK: %j.017 = phi i32 [ 0, %for.body ] +; CHECK: %add = add nuw nsw i32 %j.017, %mul +; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 +; CHECK: %0 = load i16, i16* %arrayidx, align 2 +; CHECK: %conv16 = zext i16 %0 to i32 +; CHECK: %add4 = add i32 %conv16, %val +; CHECK: %conv5 = trunc i32 %add4 to i16 +; CHECK: store i16 %conv5, i16* %arrayidx, align 2 +; CHECK: %inc = add nuw nsw i32 %j.017, 1 +; CHECK: %exitcond = icmp eq i32 %inc, 20 +; CHECK: br label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp eq i32 %inc7, 10 + br i1 %exitcond19, label %for.end8, label %for.body +; CHECK: for.inc6: +; CHECK: %inc7 = add nuw nsw i32 %i.018, 1 +; CHECK: %exitcond19 = icmp eq i32 %inc7, %flatten.tripcount +; CHECK: br i1 %exitcond19, label %for.end8, label %for.body + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +declare i32 @func(i32) +