diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -319,6 +319,12 @@
   getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
                            VFRange &Range);
 
+  /// Return true if the number of runtime pointer checks exceeds
+  /// VectorizerParams::RuntimeMemoryCheckThreshold while the hints do not
+  /// allow reordering, or exceeds PragmaVectorizeMemoryCheckThreshold
+  /// regardless of the hints.
+  bool hasTooManyRuntimeChecks() const;
+
 protected:
   /// Collect the instructions from the original loop that would be trivially
   /// dead in the vectorized loop if generated.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7591,6 +7591,14 @@
   return VectorizationFactor::Disabled();
 }
 
+bool LoopVectorizationPlanner::hasTooManyRuntimeChecks() const {
+  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+  return (NumRuntimePointerChecks >
+              VectorizerParams::RuntimeMemoryCheckThreshold &&
+          !Hints.allowReordering()) ||
+         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7662,14 +7670,8 @@
   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
 
   // Check if it is profitable to vectorize with runtime checks.
-  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
-  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
-    bool PragmaThresholdReached =
-        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-    bool ThresholdReached =
-        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
-    if ((ThresholdReached && !Hints.allowReordering()) ||
-        PragmaThresholdReached) {
+  if (SelectedVF.Width.getKnownMinValue() > 1) {
+    if (hasTooManyRuntimeChecks()) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
@@ -10457,7 +10459,9 @@
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+    // Leave IC untouched when the loop would need too many runtime checks.
+    if (!LVP.hasTooManyRuntimeChecks())
+      IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
   }
 
   // Identify the diagnostic messages that should be produced.
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-pointer-runtime-check-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/interleaved-pointer-runtime-check-unprofitable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-pointer-runtime-check-unprofitable.ll
@@ -0,0 +1,89 @@
+; REQUIRES: powerpc-registered-target
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize < %s -o - | FileCheck %s
+
+; This loop is interleaved aggressively on PowerPC (the A2 always unrolls
+; aggressively), which results in a large number of runtime memory checks.
+; Similar issues may occur on any target where aggressive interleaving is
+; enabled. Interleaving should therefore be restricted by the memory-check
+; threshold (e.g. runtime-memory-check-threshold, default 8), just like the VF.
+
+; CHECK-LABEL: @eddy_diff_caleddy_
+; CHECK-NOT: vector.memcheck
+
+define fastcc void @eddy_diff_caleddy_(i64* %wet_cl, i64* %z_e_8676_1962, i32 %ncol.cast.val) {
+L.LB7_2249.preheader:
+  %0 = load i64, i64* %z_e_8676_1962, align 8
+  %trip.count = add nuw i32 %ncol.cast.val, 1
+  %wide.trip.count = zext i32 %ncol.cast.val to i64
+  %1 = shl i64 %0, 1
+  %2 = mul i64 %0, 3
+  %3 = shl i64 %0, 2
+  %4 = mul i64 %0, 5
+  %5 = mul i64 %0, 6
+  %6 = mul i64 %0, 7
+  %7 = shl i64 %0, 3
+  %8 = mul i64 %0, 9
+  %9 = mul i64 %0, 10
+  %10 = mul i64 %0, 11
+  %11 = mul i64 %0, 12
+  br label %L.LB7_2249
+
+L.LB7_2249:                                       ; preds = %L.LB7_2249, %L.LB7_2249.preheader
+  %indvars.iv774 = phi i64 [ 0, %L.LB7_2249.preheader ], [ %indvars.iv.next775, %L.LB7_2249 ]
+  %12 = add nsw i64 %indvars.iv774, -5
+  %13 = add i64 %12, %0
+  %14 = getelementptr i64, i64* %wet_cl, i64 %13
+  %15 = bitcast i64* %14 to double*
+  store double 0.000000e+00, double* %15, align 8
+  %16 = add i64 %12, %1
+  %17 = getelementptr i64, i64* %wet_cl, i64 %16
+  %18 = bitcast i64* %17 to double*
+  store double 0.000000e+00, double* %18, align 8
+  %19 = add i64 %12, %2
+  %20 = getelementptr i64, i64* %wet_cl, i64 %19
+  %21 = bitcast i64* %20 to double*
+  store double 0.000000e+00, double* %21, align 8
+  %22 = add i64 %12, %3
+  %23 = getelementptr i64, i64* %wet_cl, i64 %22
+  %24 = bitcast i64* %23 to double*
+  store double 0.000000e+00, double* %24, align 8
+  %25 = add i64 %12, %4
+  %26 = getelementptr i64, i64* %wet_cl, i64 %25
+  %27 = bitcast i64* %26 to double*
+  store double 0.000000e+00, double* %27, align 8
+  %28 = add i64 %12, %5
+  %29 = getelementptr i64, i64* %wet_cl, i64 %28
+  %30 = bitcast i64* %29 to double*
+  store double 0.000000e+00, double* %30, align 8
+  %31 = add i64 %12, %6
+  %32 = getelementptr i64, i64* %wet_cl, i64 %31
+  %33 = bitcast i64* %32 to double*
+  store double 0.000000e+00, double* %33, align 8
+  %34 = add i64 %12, %7
+  %35 = getelementptr i64, i64* %wet_cl, i64 %34
+  %36 = bitcast i64* %35 to double*
+  store double 0.000000e+00, double* %36, align 8
+  %37 = add i64 %12, %8
+  %38 = getelementptr i64, i64* %wet_cl, i64 %37
+  %39 = bitcast i64* %38 to double*
+  store double 0.000000e+00, double* %39, align 8
+  %40 = add i64 %12, %9
+  %41 = getelementptr i64, i64* %wet_cl, i64 %40
+  %42 = bitcast i64* %41 to double*
+  store double 0.000000e+00, double* %42, align 8
+  %43 = add i64 %12, %10
+  %44 = getelementptr i64, i64* %wet_cl, i64 %43
+  %45 = bitcast i64* %44 to double*
+  store double 0.000000e+00, double* %45, align 8
+  %46 = add i64 %12, %11
+  %47 = getelementptr i64, i64* %wet_cl, i64 %46
+  %48 = bitcast i64* %47 to double*
+  store double 0.000000e+00, double* %48, align 8
+  %indvars.iv.next775 = add nuw nsw i64 %indvars.iv774, 1
+  %exitcond778.not = icmp eq i64 %indvars.iv.next775, %wide.trip.count
+  br i1 %exitcond778.not, label %L.LB7_2330.preheader, label %L.LB7_2249
+
+L.LB7_2330.preheader:                             ; preds = %L.LB7_2249
+  ret void
+}
+