Index: lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -230,7 +230,8 @@
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
-  VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
+  VectorizationFactor planInVPlanNativePath(const bool OptForSize,
+                                            const unsigned UserVF);
 
   /// Finalize the best decision and dispose of all other VPlans.
   void setBestPlan(unsigned VF, unsigned UF);
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1383,12 +1383,6 @@
       return false;
     }
 
-    if (!Hints.getWidth()) {
-      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
-      Hints.emitRemarkWithHints();
-      return false;
-    }
-
     if (Hints.getInterleave() > 1) {
       // TODO: Interleave support is future work.
       LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
@@ -6081,31 +6075,49 @@
   }
 }
 
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
+                          LoopVectorizationCostModel &CM) {
+  unsigned WidestType;
+  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+  return WidestVectorRegBits / WidestType;
+}
+
 VectorizationFactor
-LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
-                                                unsigned UserVF) {
+LoopVectorizationPlanner::planInVPlanNativePath(const bool OptForSize,
+                                                const unsigned UserVF) {
+  unsigned VF = UserVF;
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
   // the vectorization pipeline.
   if (!OrigLoop->empty()) {
-    // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
-    // This won't be necessary when UserVF is not required in the VPlan-native
-    // path.
-    if (VPlanBuildStressTest && !UserVF)
-      UserVF = 4;
+    // If the user doesn't provide a vectorization factor, determine a
+    // reasonable one.
+    if (!UserVF)
+      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+
+    // We set VF to 4 for stress testing.
+    if (VPlanBuildStressTest)
+      VF = 4;
 
     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-    assert(UserVF && "Expected UserVF for outer loop vectorization.");
-    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    buildVPlans(UserVF, UserVF);
+    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
+    const auto Msg = [](const bool IsUserVF) -> const char * {
+      return IsUserVF ? "user VF " : "computed VF ";
+    };
+    LLVM_DEBUG(dbgs() << "LV: Using " << Msg(UserVF) << VF << ".\n");
+    buildVPlans(VF, VF);
 
     // For VPlan build stress testing, we bail out after VPlan construction.
     if (VPlanBuildStressTest)
       return VectorizationFactor::Disabled();
 
-    return {UserVF, 0};
+    return {VF, 0};
   }
 
   LLVM_DEBUG(
@@ -7128,7 +7140,7 @@
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
 
   // Get user vectorization factor.
-  unsigned UserVF = Hints.getWidth();
+  const unsigned UserVF = Hints.getWidth();
 
   // Check the function attributes to find out if this function should be
   // optimized for size.
@@ -7136,16 +7148,17 @@
       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 
   // Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
+  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
 
   // If we are stress testing VPlan builds, do not attempt to generate vector
   // code. Masked vector code generation support will follow soon.
-  if (VPlanBuildStressTest || EnableVPlanPredication)
+  // Also, do not attempt to vectorize if no vector code will be produced.
+  if (VPlanBuildStressTest || EnableVPlanPredication || VF.Width == 1)
     return false;
 
   LVP.setBestPlan(VF.Width, 1);
 
-  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
+  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                          &CM);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
Index: test/Transforms/LoopVectorize/explicit_outer_detection.ll
===================================================================
--- test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -72,10 +72,9 @@
 ; be collected.
 
 ; CHECK-LABEL: case2
-; CHECK-NOT: LV: Loop hints: force=enabled
-; CHECK-NOT: LV: We can vectorize this outer loop!
-; CHECK: LV: Loop hints: force=?
-; CHECK: LV: Found a loop: inner.body
+; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using computed VF 1.
 
 define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
 entry:
Index: test/Transforms/LoopVectorize/outer_loop_test1_no_explicit_vect_width.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/outer_loop_test1_no_explicit_vect_width.ll
@@ -0,0 +1,113 @@
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable)
+;   for (i1 = 0; i1 < 8; i1++) {
+;     arr2[i1] = i1;
+;     for (i2 = 0; i2 < 8; i2++)
+;       arr[i2][i1] = i1 + n;
+;   }
+; }
+;
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple aarch64-gnu-linux < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 < %s | FileCheck %s
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+; AVX-LABEL: vector.ph:
+; AVX: %[[SplatVal:.*]] = insertelement <8 x i32> undef, i32 %n, i32 0
+; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> undef, <8 x i32> zeroinitializer
+
+; AVX-LABEL: vector.body:
+; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <8 x i64> %[[VecInd]]
+; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[VecIndTr]], <8 x i32*> %[[AAddr]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]]
+; AVX: br label %[[InnerLoop:.+]]
+
+; AVX: [[InnerLoop]]:
+; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]]
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0
+; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; AVX: [[ForInc]]:
+; AVX: %[[IndNext]] = add i64 %[[Ind]], 8
+; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
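
A minimal standalone sketch of the arithmetic behind determineVPlanVF(), for reference. The helper determineVPlanVFSketch and its plain-integer parameters are illustrative stand-ins, not part of the patch; the real code obtains the register width from TTI->getRegisterBitWidth(true /* Vector*/) and the widest type from CM.getSmallestAndWidestTypes().

#include <cassert>
#include <cstdio>

// Sketch: VF = widest vector register width (bits) / widest scalar type (bits).
static unsigned determineVPlanVFSketch(unsigned WidestVectorRegBits,
                                       unsigned WidestTypeBits) {
  assert(WidestTypeBits != 0 && "loop is expected to access at least one type");
  return WidestVectorRegBits / WidestTypeBits;
}

int main() {
  // Widths matching the new test's expectations: i32 accesses with 128-bit
  // vector registers give VF = 4 (CHECK prefix); 256-bit AVX/AVX2 registers
  // give VF = 8 (AVX prefix).
  std::printf("128-bit registers, i32: VF = %u\n", determineVPlanVFSketch(128, 32));
  std::printf("256-bit registers, i32: VF = %u\n", determineVPlanVFSketch(256, 32));
  return 0;
}

The factors 4 and 8 line up with the <4 x ...> and <8 x ...> patterns checked in outer_loop_test1_no_explicit_vect_width.ll, while the "LV: Using computed VF 1." line now expected in explicit_outer_detection.ll presumably exercises the new VF.Width == 1 bail-out in LoopVectorize.cpp, since no vector code should be generated at VF 1.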