Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -155,6 +155,11 @@ cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale")); +static cl::opt ComplexityLimit( + "lsr-complexity-limit", cl::Hidden, + cl::init(std::numeric_limits::max()), + cl::desc("LSR search space complexity limit")); + #ifndef NDEBUG // Stress test IV chain generation. static cl::opt StressIVChain( @@ -4311,9 +4316,6 @@ }); } -// This is a rough guess that seems to work fairly well. -static const size_t ComplexityLimit = std::numeric_limits::max(); - /// Estimate the worst-case number of solutions the solver might have to /// consider. It almost never considers this many solutions because it prune the /// search space, but the pruning isn't always sufficient. Index: test/Transforms/LoopStrengthReduce/ARM/complexity.ll =================================================================== --- /dev/null +++ test/Transforms/LoopStrengthReduce/ARM/complexity.ll @@ -0,0 +1,160 @@ +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT +; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX + +; CHECK-DEFAULT-LABEL: for.body12.us.us: +; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ 6, %for.body12.us.us.preheader ], [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ] +; CHECK-DEFAULT: phi i32 +; CHECK-DEFAULT: phi i32 +; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8 + +; CHECK-COMPLEX-LABEL: for.body12.us.us: +; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP5:%[^ ]+]], %for.body12.us.us.preheader ], [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ] +; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP:%[^ ]+]], %for.body12.us.us.preheader ], [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ] +; CHECK-COMPLEX: phi i32 +; CHECK-COMPLEX: phi i32 +; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4 + +define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) { +entry: + %cmp92 = icmp eq i32 %out_height, 0 + br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: + %cmp259 = icmp eq i32 %out_width, 0 + %cmp654 = icmp eq i32 %filter_dim, 0 + %0 = shl i32 %out_width, 2 + %1 = add i32 %filter_dim, -1 + %xtraiter = and i32 %filter_dim, 3 + %2 = icmp ult i32 %1, 3 + %unroll_iter = sub i32 %filter_dim, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader + +for.cond1.preheader: + %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ] + br i1 %cmp259, label %for.cond.cleanup3, label %for.cond5.preheader.lr.ph + +for.cond5.preheader.lr.ph: + %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093 + %3 = load i32*, i32** %arrayidx22, align 4 + br i1 %cmp654, label %for.cond5.preheader.preheader, label %for.cond9.preheader.us.us.preheader + +for.cond5.preheader.preheader: + %4 = bitcast i32* %3 to i8* + call void @llvm.memset.p0i8.i32(i8* align 4 %4, i8 0, i32 %0, i1 false) + br label %for.cond.cleanup3 + +for.cond9.preheader.us.us.preheader: + %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond5.preheader.lr.ph ] + br label %for.cond9.preheader.us.us + +for.cond5.for.cond.cleanup7_crit_edge.us: + %arrayidx23.us = getelementptr inbounds i32, i32* %3, i32 %res_x.060.us + store i32 %add18.us.us.lcssa, i32* %arrayidx23.us, align 4 + %add25.us = add nuw i32 %res_x.060.us, 1 + %exitcond99 = icmp eq i32 %add25.us, %out_width + br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader + +for.cond9.preheader.us.us: + %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us ], [ 0, %for.cond9.preheader.us.us.preheader ] + %result_element.055.us.us = phi i32 [ %add18.us.us.lcssa, %for.cond9.for.cond.cleanup11_crit_edge.us.us ], [ 0, %for.cond9.preheader.us.us.preheader ] + %add.us.us = add i32 %filter_y.056.us.us, %res_y.093 + %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us + %5 = load i16*, i16** %arrayidx.us.us, align 4 + %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us + %6 = load i16*, i16** %arrayidx15.us.us, align 4 + br i1 %2, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us + +for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: + %add18.us.us.lcssa.ph = phi i32 [ undef, %for.cond9.preheader.us.us ], [ %add18.us.us.3, %for.body12.us.us ] + %filter_x.053.us.us.unr = phi i32 [ 0, %for.cond9.preheader.us.us ], [ %inc.us.us.3, %for.body12.us.us ] + %result_element.152.us.us.unr = phi i32 [ %result_element.055.us.us, %for.cond9.preheader.us.us ], [ %add18.us.us.3, %for.body12.us.us ] + br i1 %lcmp.mod, label %for.cond9.for.cond.cleanup11_crit_edge.us.us, label %for.body12.us.us.epil + +for.body12.us.us.epil: + %filter_x.053.us.us.epil = phi i32 [ %inc.us.us.epil, %for.body12.us.us.epil ], [ %filter_x.053.us.us.unr, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ] + %result_element.152.us.us.epil = phi i32 [ %add18.us.us.epil, %for.body12.us.us.epil ], [ %result_element.152.us.us.unr, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body12.us.us.epil ], [ %xtraiter, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ] + %add13.us.us.epil = add i32 %filter_x.053.us.us.epil, %res_x.060.us + %arrayidx14.us.us.epil = getelementptr inbounds i16, i16* %5, i32 %filter_x.053.us.us.epil + %7 = load i16, i16* %arrayidx14.us.us.epil, align 2 + %conv.us.us.epil = sext i16 %7 to i32 + %arrayidx16.us.us.epil = getelementptr inbounds i16, i16* %6, i32 %add13.us.us.epil + %8 = load i16, i16* %arrayidx16.us.us.epil, align 2 + %conv17.us.us.epil = sext i16 %8 to i32 + %mul.us.us.epil = mul nsw i32 %conv17.us.us.epil, %conv.us.us.epil + %add18.us.us.epil = add nsw i32 %mul.us.us.epil, %result_element.152.us.us.epil + %inc.us.us.epil = add nuw i32 %filter_x.053.us.us.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond9.for.cond.cleanup11_crit_edge.us.us, label %for.body12.us.us.epil + +for.cond9.for.cond.cleanup11_crit_edge.us.us: + %add18.us.us.lcssa = phi i32 [ %add18.us.us.lcssa.ph, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ %add18.us.us.epil, %for.body12.us.us.epil ] + %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1 + %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim + br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us + +for.body12.us.us: + %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ] + %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ] + %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us + %arrayidx14.us.us = getelementptr inbounds i16, i16* %5, i32 %filter_x.053.us.us + %9 = load i16, i16* %arrayidx14.us.us, align 2 + %conv.us.us = sext i16 %9 to i32 + %arrayidx16.us.us = getelementptr inbounds i16, i16* %6, i32 %add13.us.us + %10 = load i16, i16* %arrayidx16.us.us, align 2 + %conv17.us.us = sext i16 %10 to i32 + %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us + %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us + %inc.us.us = or i32 %filter_x.053.us.us, 1 + %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us + %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %5, i32 %inc.us.us + %11 = load i16, i16* %arrayidx14.us.us.1, align 2 + %conv.us.us.1 = sext i16 %11 to i32 + %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %6, i32 %add13.us.us.1 + %12 = load i16, i16* %arrayidx16.us.us.1, align 2 + %conv17.us.us.1 = sext i16 %12 to i32 + %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1 + %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us + %inc.us.us.1 = or i32 %filter_x.053.us.us, 2 + %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us + %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %5, i32 %inc.us.us.1 + %13 = load i16, i16* %arrayidx14.us.us.2, align 2 + %conv.us.us.2 = sext i16 %13 to i32 + %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %6, i32 %add13.us.us.2 + %14 = load i16, i16* %arrayidx16.us.us.2, align 2 + %conv17.us.us.2 = sext i16 %14 to i32 + %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2 + %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1 + %inc.us.us.2 = or i32 %filter_x.053.us.us, 3 + %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us + %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %5, i32 %inc.us.us.2 + %15 = load i16, i16* %arrayidx14.us.us.3, align 2 + %conv.us.us.3 = sext i16 %15 to i32 + %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %6, i32 %add13.us.us.3 + %16 = load i16, i16* %arrayidx16.us.us.3, align 2 + %conv17.us.us.3 = sext i16 %16 to i32 + %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3 + %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2 + %inc.us.us.3 = add i32 %filter_x.053.us.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us + +for.cond.cleanup: + ret void + +for.cond.cleanup3: + %add28 = add nuw i32 %res_y.093, 1 + %exitcond100 = icmp eq i32 %add28, %out_height + br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1) #1 + Index: test/Transforms/LoopStrengthReduce/lsr-comp-time.ll =================================================================== --- test/Transforms/LoopStrengthReduce/lsr-comp-time.ll +++ test/Transforms/LoopStrengthReduce/lsr-comp-time.ll @@ -1,4 +1,6 @@ ; RUN: opt -loop-reduce -S < %s | FileCheck %s +; RUN: opt -loop-reduce -lsr-complexity-limit=2147483647 -S < %s | FileCheck %s + ; Test compile time should be <1sec (no hang). target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"