Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2516,7 +2516,8 @@ if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail"); + assert(!Cost->foldTailByMasking() && + "Cannot SCEV check stride or overflow when folding tail"); // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -4588,6 +4589,29 @@ << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return None; } + + if (!PSE.getUnionPredicate().getPredicates().empty()) { + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); + return None; + } + + // FIXME: Avoid specializing for stride==1 instead of bailing out. + if (!Legal->getLAI()->getSymbolicStrides().empty()) { + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime stride == 1 checks needed. Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + return None; + } // If we optimize the program for size, avoid creating the tail loop. 
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); Index: test/Transforms/LoopVectorize/X86/optsize.ll =================================================================== --- test/Transforms/LoopVectorize/X86/optsize.ll +++ test/Transforms/LoopVectorize/X86/optsize.ll @@ -3,6 +3,7 @@ ; will produce a tail loop with the optimize for size or the minimize size ; attributes. This is a target-dependent version of the test. ; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s +; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" @@ -136,3 +137,34 @@ attributes #1 = { minsize } + +; We can't vectorize this one because it would require versioning the loop for +; stride==1, even though TC is a multiple of VF. +; CHECK-LABEL: @scev4stride1 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: +; AUTOVF-LABEL: @scev4stride1 +; AUTOVF-NOT: vector.scevcheck +; AUTOVF-NOT: vector.body: +; AUTOVF-LABEL: for.body: +define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = mul nsw i32 %i.07, %k + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 + store i32 %0, i32* %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 256 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + ret void +} + +attributes #2 = { optsize } Index: test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll =================================================================== --- 
test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll +++ test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll @@ -239,3 +239,54 @@ for.end: ret void } + + +; Check that the need for an overflow check prevents vectorizing a loop that +; has a tiny trip count or is optimized for size. +; CHECK-LABEL: @func_34 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: bb67: +define void @func_34() { +bb1: + br label %bb67 + +bb67: ; preds = %bb67, %bb1 + %storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ] + %sext = shl i32 %storemerge2, 16 + %_tmp2299 = ashr exact i32 %sext, 16 + %_tmp2300 = add nsw i32 %_tmp2299, 1 + %_tmp2310 = trunc i32 %_tmp2300 to i16 + %_tmp2312 = icmp slt i16 %_tmp2310, 3 + br i1 %_tmp2312, label %bb67, label %bb68 + +bb68: ; preds = %bb67 + ret void +} + +; Check that the need for a stride==1 check prevents vectorizing a loop when +; optimizing for size. +; CHECK-LABEL: @scev4stride1 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: +define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 { +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = mul nsw i32 %i.07, %k + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 + store i32 %0, i32* %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + ret void +} + +attributes #0 = { optsize }