// [LV] Clamp a user-specified vectorization factor (VF) that is legally unsafe.
//
// Summary of the hunks below (all in LoopVectorize.cpp):
//  * computeFeasibleMaxVF() gains a second parameter, the user VF. A non-zero
//    user VF is returned unchanged when it is
//      <= PowerOf2Floor(MaxSafeRegisterWidth / WidestType);
//    otherwise a debug message and an optimization remark are emitted and the
//    clamped value is returned as a fixed ElementCount.
//  * The caller patched at 5235/5273 now computes MaxVF once, before the
//    switch on ScalarEpilogueStatus, instead of short-circuiting to UserVF at
//    each use.
//  * The caller patched at 6997/7010 only takes the "Using user VF" early
//    return when UserVF <= MaxVF; otherwise control falls through to the loop
//    over VF = 1, 2, 4, ... <= MaxVF.
//
// NOTE(review): if MaxSafeRegisterWidth < WidestType, MaxSafeVF would
// presumably be 0, and the "MaxVF is zero." assert in the last hunks could
// then fire -- confirm that legality rules this out on the user-VF path.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1442,7 +1442,8 @@
   /// \return An upper bound for the vectorization factor, a power-of-2 larger
   /// than zero. One is returned if vectorization should best be avoided due
   /// to cost.
-  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount);
+  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+                                    ElementCount UserVF);
 
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -5235,9 +5236,11 @@
     return None;
   }
 
+  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
-    return UserVF ? UserVF : computeFeasibleMaxVF(TC);
+    return MaxVF;
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5273,7 +5276,6 @@
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
 
-  ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
   assert(!MaxVF.isScalable() &&
          "Scalable vectors do not yet support tail folding");
   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
@@ -5326,7 +5328,9 @@
 }
 
 ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+                                                 ElementCount UserVF) {
+  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5338,6 +5342,27 @@
   // dependence distance).
   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
 
+  if (UserVF.isNonZero()) {
+    // If legally unsafe, clamp the user vectorization factor to a safe value.
+    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType);
+    if (UserVF.getFixedValue() <= MaxSafeVF)
+      return UserVF;
+
+    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+                      << ".\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "User-specified vectorization factor "
+             << ore::NV("UserVectorizationFactor", UserVF)
+             << " is unsafe, clamping to maximum safe vectorization factor "
+             << ore::NV("VectorizationFactor", MaxSafeVF);
+    });
+    return ElementCount::getFixed(MaxSafeVF);
+  }
+
   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
@@ -6997,9 +7022,12 @@
     CM.invalidateCostModelingDecisions();
   }
 
-  if (!UserVF.isZero()) {
+  ElementCount MaxVF = MaybeMaxVF.getValue();
+  assert(MaxVF.isNonZero() && "MaxVF is zero.");
+
+  if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+    assert(isPowerOf2_32(UserVF.getFixedValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
@@ -7010,9 +7038,6 @@
     return {{UserVF, 0}};
   }
 
-  ElementCount MaxVF = MaybeMaxVF.getValue();
-  assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
   for (ElementCount VF = ElementCount::getFixed(1);
        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
Index: llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
@@ -0,0 +1,45 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; The loop loads a[iv] and conditionally stores to a[iv + 16], so the user VF
+; of 32 is unsafe and gets clamped to 16; the cost model is then expected to
+; select an even smaller VF of 2.
+; The "LV:" lines are debug output (-debug-only), hence REQUIRES: asserts.
+
+; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
+; CHECK: LV: Selecting VF: 2.
+; CHECK-LABEL: @test
+; CHECK: <2 x i64>
+define void @test(i64* nocapture %a, i64* nocapture readonly %b) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
+  %0 = load i64, i64* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 %iv
+  %1 = load i64, i64* %arrayidx2, align 4
+  %add = add nsw i64 %1, %0
+  %2 = add nuw nsw i64 %iv, 16
+  %arrayidx5 = getelementptr inbounds i64, i64* %a, i64 %2
+  %c = icmp eq i64 %1, 120
+  br i1 %c, label %then, label %latch
+
+then:
+  store i64 %add, i64* %arrayidx5, align 4
+  br label %latch
+
+latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i64 32}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
@@ -0,0 +1,47 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure an unsafe user-specified vectorization factor is clamped.
+; The "LV:" line is debug output (-debug-only), hence REQUIRES: asserts.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; void foo(int *a, int *b) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (int i=0; i < 1024; ++i) {
+;     a[i + 2] = a[i] + b[i];
+;   }
+; }
+
+; CHECK: LV: User VF=4 is unsafe, clamping to max safe VF=2.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 4 is unsafe, clamping to maximum safe vectorization factor 2
+; CHECK-LABEL: @foo
+; CHECK: <2 x i32>
+define void @foo(i32* %a, i32* %b) {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}