Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5508,6 +5508,19 @@ return MaxVF; } + // Avoid tail folding if the trip count is known to be a multiple of any VF we + // chose. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + unsigned TCisMultipleOf = 1 << SE->GetMinTrailingZeros(ExitCount); + if (TCisMultipleOf % MaxVFtimesIC == 0) { + // Accept MaxVF if we do not have a tail. + LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + return MaxVF; + } + // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=3 -force-vector-width=2 -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +; Make sure the loop is unrolled under -Os without folding its tail, based on its trip-count +; being provably divisible by the chosen VFxIC. 
+ +; CHECK-LABEL: constTC +; CHECK: vector.body: +; CHECK-COUNT-3: store <2 x i32> +; CHECK: br i1 + +define dso_local void @constTC(i32* noalias nocapture %A) optsize { +entry: + br label %loop + +loop: + %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv + store i32 13, i32* %arrayidx, align 1 + %rivPlus1 = add nuw nsw i32 %riv, 1 + %cond = icmp eq i32 %rivPlus1, 1800 + br i1 %cond, label %exit, label %loop + +exit: + ret void +} Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +; Make sure loops are vectorized under -Os without folding their tails, based on +; their trip-counts' lower bits being zero. + +; CHECK-LABEL: alignTC +; CHECK: vector.body: +; CHECK: store <4 x i32> + +define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize { +entry: + %alignedTC = and i32 %n, -8 + br label %loop + +loop: + %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv + store i32 13, i32* %arrayidx, align 1 + %rivPlus1 = add nuw nsw i32 %riv, 1 + %cond = icmp eq i32 %rivPlus1, %alignedTC + br i1 %cond, label %exit, label %loop + +exit: + ret void +}