Index: llvm/include/llvm/Analysis/LoopInfo.h =================================================================== --- llvm/include/llvm/Analysis/LoopInfo.h +++ llvm/include/llvm/Analysis/LoopInfo.h @@ -780,6 +780,10 @@ /// unrolling pass is run more than once (which it generally is). void setLoopAlreadyUnrolled(); + /// Return true if the loop is annotated with pragma + /// llvm.loop.vectorize.predicate.enable, and false otherwise. + bool isAnnotatedVectorPredicate() const; + void dump() const; void dumpVerbose() const; Index: llvm/lib/Analysis/LoopInfo.cpp =================================================================== --- llvm/lib/Analysis/LoopInfo.cpp +++ llvm/lib/Analysis/LoopInfo.cpp @@ -494,6 +494,34 @@ setLoopID(NewLoopID); } +bool Loop::isAnnotatedVectorPredicate() const { + MDNode *LoopID = getLoopID(); + if (!LoopID) + return false; + + StringRef Name = "llvm.loop.vectorize.predicate.enable"; + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) + continue; + + MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + + if (Name.equals(S->getString()) && + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue()) + return true; + else + return false; + } + return false; +} + bool Loop::isAnnotatedParallel() const { MDNode *DesiredLoopIdMetadata = getLoopID(); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -841,7 +841,8 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueAllowed, CM_ScalarEpilogueNotAllowedOptSize, - CM_ScalarEpilogueNotAllowedLowTripLoop + CM_ScalarEpilogueNotAllowedLowTripLoop, + 
CM_ScalarEpilogueNotAllowedPredicatePragma }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -870,6 +871,10 @@ /// vectorization and interleaving should be avoided up front. Optional<unsigned> computeMaxVF(); + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); + /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is @@ -4686,26 +4691,8 @@ Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (isScalarEpilogueAllowed()) - return computeFeasibleMaxVF(TC); - - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" << - "LV: Performing code size checks.\n"); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -4715,7 +4702,7 @@ LLVM_DEBUG( dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return None; + return true; } if (!PSE.getUnionPredicate().getPredicates().empty()) { @@ -4726,7 +4713,7 @@ LLVM_DEBUG( dbgs() << "LV: Aborting. 
Runtime SCEV check is required with -Os/-Oz.\n"); - return None; + return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. @@ -4738,12 +4725,28 @@ LLVM_DEBUG( dbgs() << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + return true; + } + + return false; +} + +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. + LLVM_DEBUG( + dbgs() << "LV: Not inserting runtime ptr check for divergent target"); + + ORE->emit( + createMissedAnalysis("CantVersionLoopWithDivergentTarget") + << "runtime pointer checks needed. Not enabled for divergent target"); + return None; } - // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - if (TC == 1) { ORE->emit(createMissedAnalysis("SingleIterationLoop") << "loop trip count is one, irrelevant for vectorization"); @@ -4751,17 +4754,35 @@ return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (IsScalarEpilogueAllowed) { + default: return None; + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotAllowedPredicatePragma: + LLVM_DEBUG(dbgs() << "LV: vector predicate pragma found.\n" + << "LV: creating predicated vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + case CM_ScalarEpilogueNotAllowedOptSize: + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + // Bail if runtime checks are required, which are not good when optimising + // for size. 
+ if (runtimeChecksRequired()) + return None; + break; + } + + // Now try the tail folding - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); + // Bail if we don't have a tail at all. unsigned MaxVF = computeFeasibleMaxVF(TC); - if (TC > 0 && TC % MaxVF == 0) { LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -7226,6 +7247,8 @@ (F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (L->isAnnotatedVectorPredicate()) + SEL = CM_ScalarEpilogueNotAllowedPredicatePragma; LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -7318,10 +7341,13 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && (F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (L->isAnnotatedVectorPredicate()) + SEL = CM_ScalarEpilogueNotAllowedPredicatePragma; // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. 
They may require CFG and instruction level transformations before Index: llvm/test/Transforms/LoopVectorize/tail_loop_folding.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/tail_loop_folding.ll @@ -0,0 +1,79 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +; CHECK-LABEL: tail_folding_enabled( +; CHECK: vector.body: +; CHECK: %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32( +; CHECK: %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32( +; CHECK: %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load +; CHECK: call void @llvm.masked.store.v8i32.p0v8i32( +; CHECK: %index.next = add i64 %index, 8 +; CHECK: %12 = icmp eq i64 %index.next, 432 +; CHECK: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0 + +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 430 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6 +} + +define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +; 
CHECK-LABEL: tail_folding_disabled( +; CHECK: vector.body: +; CHECK-NOT: @llvm.masked.load.v8i32.p0v8i32( +; CHECK-NOT: @llvm.masked.store.v8i32.p0v8i32( +; CHECK: br i1 %44, label {{.*}}, label %vector.body +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 430 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 +} + +; CHECK: !0 = distinct !{!0, !1} +; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NEXT: !2 = distinct !{!2, !3, !1} +; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-NEXT: !4 = distinct !{!4, !1} +; CHECK-NEXT: !5 = distinct !{!5, !3, !1} + +attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" } + +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} + +!10 = distinct !{!10, !11, !12} +!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} +!12 = !{!"llvm.loop.vectorize.enable", i1 true}