Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -998,8 +998,6 @@
   /// width \p VF. Return CM_Unknown if this instruction did not pass
   /// through the cost modeling.
   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
-    assert(VF >= 2 && "Expected VF >=2");
-
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
     if (EnableVPlanNativePath)
@@ -1379,12 +1377,6 @@
     return false;
   }
 
-  if (!Hints.getWidth()) {
-    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
-    Hints.emitRemarkWithHints();
-    return false;
-  }
-
   if (Hints.getInterleave() > 1) {
     // TODO: Interleave support is future work.
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
@@ -3792,8 +3784,7 @@
     // PHIs where all control flow is uniform. We simply widen these PHIs.
     // Create a vector phi with no operands - the vector phi operands will be
     // set at the end of vector code generation.
-    Type *VecTy =
-        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+    Type *VecTy = VectorType::get(PN->getType(), VF);
     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
     OrigPHIsToFix.push_back(P);
@@ -7104,6 +7095,23 @@
   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
 }
 
+unsigned guessVPlanVF(Loop &L, unsigned WidestVectorRegBits) {
+  unsigned Max = 1;
+
+  for (auto *B : L.blocks()) {
+    for (auto I = B->begin(), E = B->end(); I != E; ++I) {
+      if (!isa<LoadInst>(*I) && !isa<StoreInst>(*I))
+        continue;
+
+      Type *ScalarDataTy = getMemInstValueType(&(*I));
+      unsigned Size = ScalarDataTy->getPrimitiveSizeInBits();
+      Max = std::max(Size, Max);
+    }
+  }
+
+  return WidestVectorRegBits / Max;
+}
+
 // Process the loop in the VPlan-native vectorization path. This path builds
 // VPlan upfront in the vectorization pipeline, which allows to apply
 // VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -7127,6 +7135,11 @@
   // Get user vectorization factor.
   unsigned UserVF = Hints.getWidth();
 
+  // If the user doesn't provide a vectorization factor, determine a
+  // reasonable one.
+  if (!UserVF)
+    UserVF = guessVPlanVF(*L, TTI->getRegisterBitWidth(true /* Vector */));
+
   // Check the function attributes to find out if this function should be
   // optimized for size.
   bool OptForSize =
Index: test/Transforms/LoopVectorize/explicit_outer_detection.ll
===================================================================
--- test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -72,10 +72,9 @@
 ; be collected.
 
 ; CHECK-LABEL: case2
-; CHECK-NOT: LV: Loop hints: force=enabled
-; CHECK-NOT: LV: We can vectorize this outer loop!
-; CHECK: LV: Loop hints: force=?
-; CHECK: LV: Found a loop: inner.body
+; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using user VF 1.
 
 define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
 entry:
Index: test/Transforms/LoopVectorize/outer_loop_test1_no_explicit_vect_width.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/outer_loop_test1_no_explicit_vect_width.ll
@@ -0,0 +1,113 @@
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable)
+;   for (i1 = 0; i1 < 8; i1++) {
+;     arr2[i1] = i1;
+;     for (i2 = 0; i2 < 8; i2++)
+;       arr[i2][i1] = i1 + n;
+;   }
+; }
+;
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple aarch64-gnu-linux < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 < %s | FileCheck %s
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+; AVX-LABEL: vector.ph:
+; AVX: %[[SplatVal:.*]] = insertelement <8 x i32> undef, i32 %n, i32 0
+; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> undef, <8 x i32> zeroinitializer
+
+; AVX-LABEL: vector.body:
+; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <8 x i64> %[[VecInd]]
+; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[VecIndTr]], <8 x i32*> %[[AAddr]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]]
+; AVX: br label %[[InnerLoop:.+]]
+
+; AVX: [[InnerLoop]]:
+; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]]
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0
+; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; AVX: [[ForInc]]:
+; AVX: %[[IndNext]] = add i64 %[[Ind]], 8
+; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
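
A note on the heuristic (illustration only, not part of the patch): guessVPlanVF() derives the default VF by dividing the widest vector register width by the widest scalar load/store size found in the loop. In the new test every memory access is i32 (32 bits), so a 128-bit register file (the plain aarch64/x86_64 RUN lines) should give VF = 128 / 32 = 4 and a 256-bit one (+avx/+avx2) should give VF = 256 / 32 = 8, matching the <4 x ...> and <8 x ...> types the CHECK and AVX prefixes expect. A minimal standalone C++ sketch of that arithmetic follows; the names guessVF and MemAccessSizesInBits and the hard-coded register widths are illustrative, not LLVM APIs.

#include <algorithm>
#include <cstdio>
#include <vector>

// Same heuristic as the patch: widest vector register width (bits) divided by
// the widest scalar memory-access type (bits) seen in the loop body.
static unsigned guessVF(unsigned WidestVectorRegBits,
                        const std::vector<unsigned> &MemAccessSizesInBits) {
  unsigned Max = 1;
  for (unsigned Size : MemAccessSizesInBits)
    Max = std::max(Max, Size);
  return WidestVectorRegBits / Max;
}

int main() {
  // The test's loop nest only loads and stores i32 values.
  std::vector<unsigned> Sizes = {32, 32};
  std::printf("128-bit vectors (SSE/NEON): VF = %u\n", guessVF(128, Sizes)); // prints 4
  std::printf("256-bit vectors (AVX/AVX2): VF = %u\n", guessVF(256, Sizes)); // prints 8
  return 0;
}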