Index: include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -228,8 +228,8 @@ bool canVectorize(bool UseVPlanNativePath); /// Return true if we can vectorize this loop while folding its tail by - /// masking. - bool canFoldTailByMasking(); + /// masking, and mark all respective loads/stores for masking. + bool prepareToFoldTailByMasking(); /// Returns the primary induction variable. PHINode *getPrimaryInduction() { return PrimaryInduction; } @@ -355,9 +355,16 @@ bool canVectorizeOuterLoop(); /// Return true if all of the instructions in the block can be speculatively - /// executed. \p SafePtrs is a list of addresses that are known to be legal - /// and we know that we can read from them without segfault. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); + /// executed, and record the loads/stores that require masking. If's that + /// guard loads can be ignored under "assume safety" unless \p PreserveGuards + /// is true. This can happen when we introduce guards for which the original + /// "unguarded-loads are safe" assumption does not hold. For example, the + /// vectorizer's fold-tail transformation changes the loop to execute beyond + /// its original trip-count, under a proper guard, which should be preserved. + /// \p SafePtrs is a list of addresses that are known to be legal and we know + /// that we can read from them without segfault. + bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs, + bool PreserveGuards = false); /// Updates the vectorization state by adding \p Phi to the inductions list. 
/// This can set \p Phi as the main induction of the loop if \p Phi is a Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -869,7 +869,7 @@ } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl &SafePtrs) { + BasicBlock *BB, SmallPtrSetImpl &SafePtrs, bool PreserveGuards) { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -888,7 +888,7 @@ // !llvm.mem.parallel_loop_access implies if-conversion safety. // Otherwise, record that the load needs (real or emulated) masking // and let the cost model decide. - if (!IsAnnotatedParallel) + if (!IsAnnotatedParallel || PreserveGuards) MaskedOp.insert(LI); continue; } @@ -1159,7 +1159,7 @@ return Result; } -bool LoopVectorizationLegality::canFoldTailByMasking() { +bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1202,7 +1202,7 @@ // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, /* PreserveGuards= */ true)) { reportVectorizationFailure( "Cannot fold tail by masking as required", "control flow cannot be substituted for a select", Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4845,7 +4845,7 @@ // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 
- if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } Index: test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll =================================================================== --- test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll +++ test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll @@ -0,0 +1,166 @@ +; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +; Case1: With pragma predicate to force tail-folding. +; All memory operations are masked. +;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) { +; #pragma clang loop vectorize_predicate(enable) +; for(int ix=0; ix < 1021; ++ix) { +; if (ix > guard) { +; p[ix] = q1[ix] + q2[ix]; +; } +; } +;} + +;CHECK-LABEL: @fold_tail +;CHECK: vector.body: +;CHECK: call <8 x i32> @llvm.masked.load +;CHECK: call <8 x i32> @llvm.masked.load +;CHECK: call void @llvm.masked.store + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, +i32 %guard) local_unnamed_addr #0 { +entry: + %0 = sext i32 %guard to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %cmp1 = icmp sgt i64 %indvars.iv, %0 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv + %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2 + %add = add nsw i32 %2, %1 + %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv + store 
i32 %add, i32* %arrayidx5, align 4, !tbaa !2 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1021 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8 +} + +; Case2: With pragma assume_safety only the store is masked. +; void assume_safety(int * p, int * q1, int * q2, int guard) { +; #pragma clang loop vectorize(assume_safety) +; for(int ix=0; ix < 1021; ++ix) { +; if (ix > guard) { +; p[ix] = q1[ix] + q2[ix]; +; } +; } +;} + +;CHECK-LABEL: @assume_safety +;CHECK: vector.body: +;CHECK-NOT: @llvm.masked.load +;CHECK: call void @llvm.masked.store + +; Function Attrs: norecurse nounwind uwtable +define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 { + %5 = sext i32 %3 to i64 + br label %7 + +;