Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1442,7 +1442,8 @@ /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - ElementCount computeFeasibleMaxVF(unsigned ConstTripCount); + ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -5235,9 +5236,11 @@ return None; } + ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); + switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return UserVF ? UserVF : computeFeasibleMaxVF(TC); + return MaxVF; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5273,7 +5276,6 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); assert(!MaxVF.isScalable() && "Scalable vectors do not yet support tail folding"); assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && @@ -5326,7 +5328,9 @@ } ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5338,6 +5342,27 @@ // dependence distance). 
unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + if (UserVF.isNonZero()) { + // If legally unsafe, clamp the user vectorization factor to a safe value. + unsigned MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType); + if (UserVF.getFixedValue() <= MaxSafeVF) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + return ElementCount::getFixed(MaxSafeVF); + } + WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. @@ -6997,9 +7022,12 @@ CM.invalidateCostModelingDecisions(); } - if (!UserVF.isZero()) { + ElementCount MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF.isNonZero() && "MaxVF is zero."); + + if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF.getKnownMinValue()) && + assert(isPowerOf2_32(UserVF.getFixedValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. @@ -7010,9 +7038,6 @@ return {{UserVF, 0}}; } - ElementCount MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF.isNonZero() && "MaxVF is zero."); - for (ElementCount VF = ElementCount::getFixed(1); ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. 
Index: llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll @@ -0,0 +1,47 @@ +; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user specified vectorization factor is clamped. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; void foo(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4) +; for (int i=0; i<N; ++i) { +; a[i + 2] = a[i] + b[i]; +; } +; } + +; CHECK: LV: User VF=4 is unsafe, clamping to max safe VF=2. +; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 4 is unsafe, clamping to maximum safe vectorization factor 2 +; CHECK-LABEL: @foo +; CHECK: <2 x i32> +define void @foo(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = 
!{!"llvm.loop.vectorize.enable", i1 true}