Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6117,17 +6117,20 @@
   // If the user doesn't provide a vectorization factor, determine a
   // reasonable one.
   if (!UserVF) {
-    // We set VF to 4 for stress testing.
-    if (VPlanBuildStressTest)
+    VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+    LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+    // Make sure we have a VF > 1 for stress testing.
+    if (VPlanBuildStressTest && VF < 2) {
+      LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+                        << "overriding computed VF.\n");
       VF = 4;
-    else
-      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+    }
   }
-
   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
   assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
-  LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user VF " : "computed VF ")
-                    << VF << " to build VPlans.\n");
+  LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
+                    << " to build VPlans.\n");
   buildVPlans(VF, VF);

   // For VPlan build stress testing, we bail out after VPlan construction.
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
@@ -16,6 +16,7 @@
 ; }
 ;

+; CHECK-LABEL: @foo_i32(
 ; CHECK-LABEL: vector.ph:
 ; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
 ; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
@@ -48,8 +49,11 @@
 @arr2 = external global [8 x i32], align 16
 @arr = external global [8 x [8 x i32]], align 16

+@arrX = external global [8 x i64], align 16
+@arrY = external global [8 x [8 x i64]], align 16
+
 ; Function Attrs: norecurse nounwind uwtable
-define void @foo(i32 %n) {
+define void @foo_i32(i32 %n) {
 entry:
   br label %for.body

@@ -79,5 +83,62 @@
   ret void
 }

+; CHECK-LABEL: @foo_i64(
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <2 x i64> undef, i64 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <2 x i64> %[[SplatVal]], <2 x i64> undef, <2 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, <2 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[VecInd]], <2 x i64*> %[[AAddr]], i32 4, <2 x i1> <i1 true, i1 true>)
+; CHECK: %[[StoreVal:.*]] = add nsw <2 x i64> %[[VecInd]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[StoreVal]], <2 x i64*> %[[AAddr2]], i32 4, <2 x i1> <i1 true, i1 true>
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]], <i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <2 x i64> %[[InnerPhiNext]], <i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <2 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 2
+; CHECK: %[[VecIndNext]] = add <2 x i64> %[[VecInd]], <i64 2, i64 2>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+; Function Attrs: norecurse nounwind uwtable
+define void @foo_i64(i64 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, i64 %indvars.iv21
+  store i64 %indvars.iv21, i64* %arrayidx, align 4
+  %add = add nsw i64 %indvars.iv21, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i64 %add, i64* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
 !1 = distinct !{!1, !2}
 !2 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -73,7 +73,7 @@
 ; CHECK-LABEL: case2
 ; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
 ; CHECK: LV: We can vectorize this outer loop!
-; CHECK: LV: Using computed VF 1 to build VPlans.
+; CHECK: LV: Using VF 1 to build VPlans.

 define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
 entry:
Index: llvm/trunk/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s -S -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+; This test checks that, when stress testing VPlan, if the computed VF
+; is 1, we override it to VF = 4.
+
+; CHECK: LV: VPlan computed VF 1.
+; CHECK: LV: VPlan stress testing: overriding computed VF.
+; CHECK: LV: Using VF 4 to build VPlans.
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
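
For readers skimming the patch, the following is a minimal standalone sketch of the VF-selection policy that the LoopVectorize.cpp hunk above introduces for the VPlan-native path. It is illustration only, not the in-tree code: the function pickVPlanVF and its parameters are invented here, and the target-derived value that the real code obtains from determineVPlanVF(TTI->getRegisterBitWidth(...), CM) is passed in as a plain integer.

  // Sketch of the new VF-selection policy (hypothetical names, not LLVM API).
  #include <cassert>
  #include <cstdio>

  static bool isPowerOf2(unsigned X) { return X && (X & (X - 1)) == 0; }

  static unsigned pickVPlanVF(unsigned UserVF, unsigned ComputedVF,
                              bool StressTest) {
    unsigned VF = UserVF;
    if (!UserVF) {
      // Previously, stress testing unconditionally forced VF = 4.
      // Now the target-derived VF is always used as the starting point...
      VF = ComputedVF;
      // ...and is only overridden under -vplan-build-stress-test when it
      // degenerates to 1, so that wide VPlans still get built.
      if (StressTest && VF < 2)
        VF = 4;
    }
    assert(isPowerOf2(VF) && "VF needs to be a power of two");
    return VF;
  }

  int main() {
    // Mirrors the new test: a computed VF of 1 under stress testing becomes 4.
    std::printf("%u\n", pickVPlanVF(/*UserVF=*/0, /*ComputedVF=*/1,
                                    /*StressTest=*/true)); // prints 4
    // A user-provided VF is always respected.
    std::printf("%u\n", pickVPlanVF(/*UserVF=*/8, /*ComputedVF=*/1,
                                    /*StressTest=*/true)); // prints 8
    return 0;
  }

The two LLVM_DEBUG messages added in the real patch ("LV: VPlan computed VF ..." and "LV: VPlan stress testing: overriding computed VF.") are exactly what the new vplan-stress-test-no-explict-vf.ll test matches with its CHECK lines.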