Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2557,7 +2557,8 @@ if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail"); + assert(!Cost->foldTailByMasking() && + "Cannot SCEV check stride or overflow when folding tail"); // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -4637,6 +4638,29 @@ return None; } + if (!PSE.getUnionPredicate().getPredicates().empty()) { + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); + return None; + } + + // FIXME: Avoid specializing for stride==1 instead of bailing out. + if (!Legal->getLAI()->getSymbolicStrides().empty()) { + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime stride == 1 checks needed. Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + return None; + } + // If we optimize the program for size, avoid creating the tail loop. LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); Index: test/Transforms/LoopVectorize/X86/optsize.ll =================================================================== --- test/Transforms/LoopVectorize/X86/optsize.ll +++ test/Transforms/LoopVectorize/X86/optsize.ll @@ -3,6 +3,7 @@ ; will produce a tail loop with the optimize for size or the minimize size ; attributes. This is a target-dependent version of the test. 
; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s +; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" @@ -136,3 +137,62 @@ attributes #1 = { minsize } + +; We can't vectorize this one because it needs versioning for stride==1, even +; though TC is a multiple of VF. +; CHECK-LABEL: @scev4stride1 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: +; AUTOVF-LABEL: @scev4stride1 +; AUTOVF-NOT: vector.scevcheck +; AUTOVF-NOT: vector.body: +; AUTOVF-LABEL: for.body: +define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = mul nsw i32 %i.07, %k + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 + store i32 %0, i32* %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 256 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + ret void +} + +attributes #2 = { optsize } + + +; PR39497 +; We can't vectorize this one because it needs a runtime overflow check, and +; its tiny trip count implies opt-for-size (which otherwise could fold the tail +; by masking). 
+; CHECK-LABEL: @main +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.cond: +; AUTOVF-LABEL: @main +; AUTOVF-NOT: vector.scevcheck +; AUTOVF-NOT: vector.body: +; AUTOVF-LABEL: for.cond: +define i32 @main() local_unnamed_addr { +while.cond: + br label %for.cond + +for.cond: + %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ] + %conv = and i32 %d.0, 65535 + %cmp = icmp ult i32 %conv, 4 + %add = add nuw nsw i32 %conv, 1 + br i1 %cmp, label %for.cond, label %while.cond.loopexit + +while.cond.loopexit: + ret i32 0 +} Index: test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll =================================================================== --- test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -0,0 +1,54 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; PR39417 +; Check that the need for overflow check prevents vectorizing a loop with tiny +; trip count (which implies opt for size). +; CHECK-LABEL: @func_34 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: bb67: +define void @func_34() { +bb1: + br label %bb67 + +bb67: + %storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ] + %sext = shl i32 %storemerge2, 16 + %_tmp2299 = ashr exact i32 %sext, 16 + %_tmp2300 = add nsw i32 %_tmp2299, 1 + %_tmp2310 = trunc i32 %_tmp2300 to i16 + %_tmp2312 = icmp slt i16 %_tmp2310, 3 + br i1 %_tmp2312, label %bb67, label %bb68 + +bb68: + ret void +} + +; Check that the need for stride==1 check prevents vectorizing a loop under opt +; for size. 
+; CHECK-LABEL: @scev4stride1 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: +define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 { +for.body.preheader: + br label %for.body + +for.body: + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = mul nsw i32 %i.07, %k + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 + store i32 %0, i32* %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + ret void +} + +attributes #0 = { optsize }