diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -319,6 +319,12 @@
   getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
                            VFRange &Range);
 
+  /// Check if the number of runtime pointer checks exceeds the threshold:
+  /// either it is above VectorizerParams::RuntimeMemoryCheckThreshold while
+  /// the loop hints do not allow reordering, or it is above
+  /// PragmaVectorizeMemoryCheckThreshold.
+  bool requiresTooManyRuntimeChecks();
+
 protected:
   /// Collect the instructions from the original loop that would be trivially
   /// dead in the vectorized loop if generated.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7591,6 +7591,16 @@
   return VectorizationFactor::Disabled();
 }
 
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() {
+  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+  // The regular threshold only applies when the hints do not allow reordering;
+  // the pragma threshold applies regardless of the hints.
+  return (NumRuntimePointerChecks >
+              VectorizerParams::RuntimeMemoryCheckThreshold &&
+          !Hints.allowReordering()) ||
+         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7659,30 +7669,7 @@
     return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
-  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
-
-  // Check if it is profitable to vectorize with runtime checks.
-  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
-  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
-    bool PragmaThresholdReached =
-        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-    bool ThresholdReached =
-        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
-    if ((ThresholdReached && !Hints.allowReordering()) ||
-        PragmaThresholdReached) {
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysisAliasing(
-                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
-                   OrigLoop->getHeader())
-               << "loop not vectorized: cannot prove it is safe to reorder "
-                  "memory operations";
-      });
-      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
-      Hints.emitRemarkWithHints();
-      return VectorizationFactor::Disabled();
-    }
-  }
-  return SelectedVF;
+  return CM.selectVectorizationFactor(VFCandidates);
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10456,8 +10443,38 @@
 
   if (MaybeVF) {
     VF = *MaybeVF;
+    // When no VF was requested (UserVF is zero), refuse to vectorize if the
+    // number of runtime checks is above the threshold.
+    bool RequiresTooManyRtChecks = LVP.requiresTooManyRuntimeChecks();
+    if (!UserVF && RequiresTooManyRtChecks) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysisAliasing(
+                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+                   L->getHeader())
+               << "loop not vectorized: cannot prove it is safe to reorder "
+                  "memory operations";
+      });
+      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+      Hints.emitRemarkWithHints();
+      VF = VectorizationFactor::Disabled();
+    }
+
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+    // Likewise, when no IC was requested (UserIC is zero), fall back to an
+    // interleave count of 1 if too many runtime checks would be required.
+    if (!UserIC && RequiresTooManyRtChecks) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysisAliasing(
+                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+                   L->getHeader())
+               << "loop not interleaved: cannot prove it is safe to reorder "
+                  "memory operations";
+      });
+      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+      Hints.emitRemarkWithHints();
+      IC = 1;
+    }
   }
 
   // Identify the diagnostic messages that should be produced.
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
@@ -0,0 +1,91 @@
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize < %s -o - | FileCheck %s
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize -force-vector-interleave=2 < %s -o - | FileCheck -check-prefix=CHECK-INTERLEAVE %s
+
+; This case interleaves aggressively on PowerPC, resulting in a lot of memory checks.
+; (On the A2, always unroll aggressively. In fact, if aggressive interleaving is enabled,
+; similar issues may occur on other targets).
+; Interleaving should also be restricted by the threshold of memory checks similar to VF.
+; (e.g., runtime-memory-check-threshold, default 8).
+
+; CHECK-LABEL: @eddy_diff_caleddy_
+; CHECK-NOT: vector.memcheck
+
+; CHECK-INTERLEAVE-LABEL: @eddy_diff_caleddy_
+; CHECK-INTERLEAVE: vector.memcheck
+
+define fastcc void @eddy_diff_caleddy_(i64* %wet_cl, i64 %0, i32 %ncol.cast.val) {
+entry:
+  %trip.count = add nuw i32 %ncol.cast.val, 1
+  %wide.trip.count = zext i32 %ncol.cast.val to i64
+  %1 = shl i64 %0, 1
+  %2 = mul i64 %0, 3
+  %3 = shl i64 %0, 2
+  %4 = mul i64 %0, 5
+  %5 = mul i64 %0, 6
+  %6 = mul i64 %0, 7
+  %7 = shl i64 %0, 3
+  %8 = mul i64 %0, 9
+  %9 = mul i64 %0, 10
+  %10 = mul i64 %0, 11
+  %11 = mul i64 %0, 12
+  br label %loop.body
+
+loop.body:
+  %indvars.iv774 = phi i64 [ 0, %entry ], [ %indvars.iv.next775, %loop.body ]
+  %12 = add nsw i64 %indvars.iv774, -5
+  %13 = add i64 %12, %0
+  %14 = getelementptr i64, i64* %wet_cl, i64 %13
+  %15 = bitcast i64* %14 to double*
+  store double 0.000000e+00, double* %15, align 8
+  %16 = add i64 %12, %1
+  %17 = getelementptr i64, i64* %wet_cl, i64 %16
+  %18 = bitcast i64* %17 to double*
+  store double 0.000000e+00, double* %18, align 8
+  %19 = add i64 %12, %2
+  %20 = getelementptr i64, i64* %wet_cl, i64 %19
+  %21 = bitcast i64* %20 to double*
+  store double 0.000000e+00, double* %21, align 8
+  %22 = add i64 %12, %3
+  %23 = getelementptr i64, i64* %wet_cl, i64 %22
+  %24 = bitcast i64* %23 to double*
+  store double 0.000000e+00, double* %24, align 8
+  %25 = add i64 %12, %4
+  %26 = getelementptr i64, i64* %wet_cl, i64 %25
+  %27 = bitcast i64* %26 to double*
+  store double 0.000000e+00, double* %27, align 8
+  %28 = add i64 %12, %5
+  %29 = getelementptr i64, i64* %wet_cl, i64 %28
+  %30 = bitcast i64* %29 to double*
+  store double 0.000000e+00, double* %30, align 8
+  %31 = add i64 %12, %6
+  %32 = getelementptr i64, i64* %wet_cl, i64 %31
+  %33 = bitcast i64* %32 to double*
+  store double 0.000000e+00, double* %33, align 8
+  %34 = add i64 %12, %7
+  %35 = getelementptr i64, i64* %wet_cl, i64 %34
+  %36 = bitcast i64* %35 to double*
+  store double 0.000000e+00, double* %36, align 8
+  %37 = add i64 %12, %8
+  %38 = getelementptr i64, i64* %wet_cl, i64 %37
+  %39 = bitcast i64* %38 to double*
+  store double 0.000000e+00, double* %39, align 8
+  %40 = add i64 %12, %9
+  %41 = getelementptr i64, i64* %wet_cl, i64 %40
+  %42 = bitcast i64* %41 to double*
+  store double 0.000000e+00, double* %42, align 8
+  %43 = add i64 %12, %10
+  %44 = getelementptr i64, i64* %wet_cl, i64 %43
+  %45 = bitcast i64* %44 to double*
+  store double 0.000000e+00, double* %45, align 8
+  %46 = add i64 %12, %11
+  %47 = getelementptr i64, i64* %wet_cl, i64 %46
+  %48 = bitcast i64* %47 to double*
+  store double 0.000000e+00, double* %48, align 8
+  %indvars.iv.next775 = add nuw nsw i64 %indvars.iv774, 1
+  %exitcond778.not = icmp eq i64 %indvars.iv.next775, %wide.trip.count
+  br i1 %exitcond778.not, label %loop.end, label %loop.body
+
+loop.end:
+  ret void
+}
+