Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1442,7 +1442,7 @@ /// \return An upper bound for the vectorization factor, a power-of-2 larger /// than zero. One is returned if vectorization should best be avoided due /// to cost. - unsigned computeFeasibleMaxVF(unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount, unsigned UserVF); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -5234,9 +5234,11 @@ return None; } + auto MaxVF = computeFeasibleMaxVF(TC, UserVF); + switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return UserVF ? UserVF : computeFeasibleMaxVF(TC); + return MaxVF; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5272,8 +5274,7 @@ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); - assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); + assert(isPowerOf2_32(MaxVF) && "MaxVF must be a power of 2"); unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; if (TC > 0 && TC % MaxVFtimesIC == 0) { // Accept MaxVF if we do not have a tail. @@ -5321,7 +5322,8 @@ } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + unsigned UserVF) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5333,6 +5335,18 @@ // dependence distance). 
unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + if (UserVF) { + // If legally unsafe, clamp the user vectorization factor to a safe value. + auto MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType); + if (UserVF <= MaxSafeVF) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, using maximum safe VF=" << MaxSafeVF + << ".\n"); + return MaxSafeVF; + } + WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. @@ -7000,7 +7014,10 @@ CM.invalidateCostModelingDecisions(); } - if (!UserVF.isZero()) { + unsigned MaxVF = MaybeMaxVF.getValue(); + assert(MaxVF != 0 && "MaxVF is zero."); + + if (!UserVF.isZero() && UserVF.getKnownMinValue() <= MaxVF) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); @@ -7014,9 +7031,6 @@ return {{UserVF, 0}}; } - unsigned MaxVF = MaybeMaxVF.getValue(); - assert(MaxVF != 0 && "MaxVF is zero."); - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); Index: llvm/test/Transforms/LoopVectorize/unsafe-vf-remark.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/unsafe-vf-remark.ll @@ -0,0 +1,46 @@ +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s + +; Make sure the unsafe user specified vectorization factor is clamped. 
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; void foo(int *a, int *b, int N) { +; #pragma clang loop vectorize(enable) vectorize_width(4) +; for (int i=0; i<N; ++i) { +; a[i + 2] = a[i] + b[i]; +; } +; } + +; CHECK: LV: User VF=4 is unsafe, using maximum safe VF=2. +; CHECK-LABEL: @foo +; CHECK: <2 x i32> +define void @foo(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %cmp12 = icmp sgt i32 %N, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %add, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 +} + +!0 = !{!0, !1, !2} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = !{!"llvm.loop.vectorize.enable", i1 true}