diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -235,6 +235,54 @@ "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +// Option prefer-predicate-with-vp-intrinsics is an experimental switch to +// indicate that the loop vectorizer should try to generate VP intrinsics if +// tail-folding is enabled (note that this option is dependent on the +// prefer-predicate-over-epilogue option being set to predicate-dont-vectorize). +// This can be particularly useful for targets like RISC-V and SX-Aurora that +// support vector length predication. +// Currently this switch takes four possible values: +// 0. no-predication: Do not generate VP intrinsics. +// 1. if-active-vector-length-supported: Only generate VP intrinsics if the +// target supports active vector length based predication. +// 2. without-avl-support: Generate VP intrinsics even if vector length based +// predication is not supported. This will behave a bit like existing +// tail-folding by using a mask for predication, except all instructions are +// widened to VP intrinsics and not just memory instructions. Use of this +// options is discouraged and is only meant for experimental/testing purpose. +// 3. force-active-vector-length-support: This is purely an experimental/testing +// option which will be removed in future. It forces the loop vectorizer to +// assume that the target supports vector length predication. +namespace PreferVPIntrinsicsTy { +enum Option { + NoPredication = 0, + IfAVLSupported, + WithoutAVLSupport, + ForceAVLSupport +}; +} // namespace PreferVPIntrinsicsTy + +static cl::opt PreferPredicateWithVPIntrinsics( + "prefer-predicate-with-vp-intrinsics", + cl::init(PreferVPIntrinsicsTy::NoPredication), cl::Hidden, + cl::desc("When vectorizing with tail-folding, generate vector predication " + "intrinsics."), + cl::values( + clEnumValN(PreferVPIntrinsicsTy::NoPredication, "no-predication", + "Do not generate VP intrinsics."), + clEnumValN(PreferVPIntrinsicsTy::IfAVLSupported, + "if-active-vector-length-support", + "Only generate VP intrinsics if the target supports vector " + "length predication."), + clEnumValN(PreferVPIntrinsicsTy::WithoutAVLSupport, + "without-active-vector-length-support", + "Generate VP intrinsics even if vector length predication " + "is not supported. This option is discouraged."), + clEnumValN(PreferVPIntrinsicsTy::ForceAVLSupport, + "force-active-vector-length-support", + "Assume that the target supports vector length predication " + "and generate VP intrinsics accordingly."))); + static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -1589,6 +1637,11 @@ return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Returns true if VP intrinsics should be generated in the tail folded loop. + bool preferVPIntrinsics() const { + return foldTailByMasking() && PreferVPIntrinsics; + } + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. @@ -1749,6 +1802,9 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; + /// Control whether to generate VP intrinsics in vectorized code. + bool PreferVPIntrinsics = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -5905,6 +5961,28 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; + if (!PreferPredicateWithVPIntrinsics) + return MaxFactors; + + if (UserIC > 1) { + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "not generate VP intrinsics since interleave count " + "specified is greater than 1.\n"); + return MaxFactors; + } + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::IfAVLSupported) { + PreferVPIntrinsics = TTI.hasActiveVectorLength(); + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics if the target " + "support vector length predication.\n"); + } else { + PreferVPIntrinsics = true; + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics.\n"); + } + return MaxFactors; } @@ -6333,6 +6411,11 @@ if (!isScalarEpilogueAllowed()) return 1; + // Do not interleave if VP intrinsics are preferred and no User IC is + // specified. + if (preferVPIntrinsics()) + return 1; + // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1;