Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -5930,18 +5930,24 @@ '``llvm.loop.vectorize.width``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This metadata sets the target width of the vectorizer. The first -operand is the string ``llvm.loop.vectorize.width`` and the second -operand is an integer specifying the width. For example: +The vector width is a tuple, where the first value specifies the minimum vector +width and the second value indicates whether the vectorization factor is +scalable or not. One example of this is shown below: .. code-block:: llvm - !0 = !{!"llvm.loop.vectorize.width", i32 4} + !0 = !{!"llvm.loop.vectorize.width", !1} + !1 = !{i32 4, i1 true} -Note that setting ``llvm.loop.vectorize.width`` to 1 disables -vectorization of the loop. If ``llvm.loop.vectorize.width`` is set to -0 or if the loop does not have this metadata the width will be -determined automatically. +which indicates the loop-vectorizer should use vector-length agnostic +vectorization with a minimum vector width of 4. + +For fixed-width vectorization-factors, a short-hand ``i32`` operand for +``llvm.loop.vectorize.width`` is also supported: + +.. code-block:: llvm + + !0 = !{!"llvm.loop.vectorize.width", i32 4} '``llvm.loop.vectorize.followup_vectorized``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -194,6 +194,9 @@ /// Find named metadata for a loop with an integer value. llvm::Optional<int> getOptionalIntLoopAttribute(Loop *TheLoop, StringRef Name); +llvm::Optional<ElementCount> +getOptionalElementCountLoopAttribute(Loop *TheLoop, StringRef Name); + /// Create a new loop identifier for a loop created from a loop transformation. 
/// /// @param OrigLoopID The loop ID of the loop before the transformation. Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -29,6 +29,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/LoopUtils.h" namespace llvm { @@ -49,13 +50,21 @@ /// Hint - associates name and validation with the hint value. struct Hint { const char *Name; - unsigned Value; // This may have to change for non-numeric values. + union { + unsigned U32; // Used for boolean and integer hint values. + ElementCount EC; // Used for the vectorization width. + } Value; HintKind Kind; Hint(const char *Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) {} + : Name(Name), Value({Value}), Kind(Kind) {} - bool validate(unsigned Val); + Hint(const char *Name, ElementCount EC) + : Name(Name), Value({0}), Kind(HK_WIDTH) { + Value.EC = EC; + } + + bool validateAndSet(const Metadata *Arg); }; /// Vectorization width. @@ -98,15 +107,15 @@ /// Dumps all the hint information. 
void emitRemarkWithHints() const; - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - unsigned getIsVectorized() const { return IsVectorized.Value; } - unsigned getPredicate() const { return Predicate.Value; } + ElementCount getWidth() const { return Width.Value.EC; } + unsigned getInterleave() const { return Interleave.Value.U32; } + unsigned getIsVectorized() const { return IsVectorized.Value.U32; } + unsigned getPredicate() const { return Predicate.Value.U32; } enum ForceKind getForce() const { - if ((ForceKind)Force.Value == FK_Undefined && + if ((ForceKind)Force.Value.U32 == FK_Undefined && hasDisableAllTransformsHint(TheLoop)) return FK_Disabled; - return (ForceKind)Force.Value; + return (ForceKind)Force.Value.U32; } /// If hints are provided that force vectorization, use the AlwaysPrint @@ -119,7 +128,9 @@ // enabled by default because can be unsafe or inefficient. For example, // reordering floating-point operations will change the way round-off // error accumulates in the loop. - return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; + ElementCount EC = getWidth(); + return getForce() == LoopVectorizeHints::FK_Enabled || + EC.getKnownMinValue() > 1; } bool isPotentiallyUnsafe() const { @@ -138,7 +149,7 @@ void getHintsFromMetadata(); /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg); + void setHint(StringRef Name, const Metadata *Arg); /// The loop these hints belong to. 
const Loop *TheLoop; Index: llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp =================================================================== --- llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -48,12 +48,12 @@ if (hasVectorizeTransformation(L) == TM_ForcedByUser) { LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n"); - Optional<int> VectorizeWidth = - getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width"); + Optional<ElementCount> VectorizeWidth = + getOptionalElementCountLoopAttribute(L, "llvm.loop.vectorize.width"); Optional<int> InterleaveCount = getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count"); - if (VectorizeWidth.getValueOr(0) != 1) + if (!VectorizeWidth || VectorizeWidth->isVector()) ORE->emit( DiagnosticInfoOptimizationFailure(DEBUG_TYPE, "FailedRequestedVectorization", Index: llvm/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopUtils.cpp +++ llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -301,6 +301,31 @@ return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false); } +llvm::Optional<ElementCount> +llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop, StringRef Name) { + const MDOperand *AttrMD = + findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr); + if (!AttrMD) + return None; + + if (ConstantInt *IntMD = mdconst::dyn_extract<ConstantInt>(AttrMD->get())) + return ElementCount::getFixed(IntMD->getZExtValue()); + else if (const MDNode *MD = dyn_cast<MDNode>(AttrMD->get())) { + // Here we are looking for a tuple of the form: + // { Width, IsScalable } + if (MD->getNumOperands() != 2) + return None; + const ConstantInt *C0 = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(0)); + const ConstantInt *C1 = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(1)); + if (C0 && C1) + return ElementCount::get(C0->getZExtValue(), C1->getZExtValue()); + } + + return None; +} + llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop, 
StringRef Name) { const MDOperand *AttrMD = @@ -450,14 +475,15 @@ if (Enable == false) return TM_SuppressedByUser; - Optional<int> VectorizeWidth = - getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width"); + Optional<ElementCount> VectorizeWidth = + getOptionalElementCountLoopAttribute(L, "llvm.loop.vectorize.width"); Optional<int> InterleaveCount = getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count"); // 'Forcing' vector width and interleave count to one effectively disables // this tranformation. - if (Enable == true && VectorizeWidth == 1 && InterleaveCount == 1) + if (Enable == true && (VectorizeWidth && VectorizeWidth->isScalar()) && + InterleaveCount == 1) return TM_SuppressedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) @@ -466,10 +492,10 @@ if (Enable == true) return TM_ForcedByUser; - if (VectorizeWidth == 1 && InterleaveCount == 1) + if ((VectorizeWidth && VectorizeWidth->isScalar()) && InterleaveCount == 1) return TM_Disable; - if (VectorizeWidth > 1 || InterleaveCount > 1) + if ((VectorizeWidth && VectorizeWidth->isVector()) || InterleaveCount > 1) return TM_Enable; if (hasDisableAllTransformsHint(L)) Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -56,17 +56,48 @@ namespace llvm { -bool LoopVectorizeHints::Hint::validate(unsigned Val) { +bool LoopVectorizeHints::Hint::validateAndSet(const Metadata *Arg) { + unsigned IntVal; + bool IsScalable = false; + + if (const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg)) + IntVal = C->getZExtValue(); + else if (const MDNode *MD = dyn_cast<MDNode>(Arg)) { + // Here we are looking for a tuple of the form: + // { Width, IsScalable } + if (Kind != HK_WIDTH || MD->getNumOperands() != 2) + return false; + const ConstantInt *C0 = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(0)); + const ConstantInt *C1 = + 
mdconst::dyn_extract<ConstantInt>(MD->getOperand(1)); + if (!C0 || !C1) + return false; + IntVal = C0->getZExtValue(); + IsScalable = C1->getZExtValue(); + } else + return false; + + auto ConditionallySetIntValue = [this](unsigned Val, bool Cond) { + if (Cond) + this->Value.U32 = Val; + return Cond; + }; + switch (Kind) { case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + if (!isPowerOf2_32(IntVal) || IntVal > VectorizerParams::MaxVectorWidth) + return false; + Value.EC = ElementCount::get(IntVal, IsScalable); + return true; case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + return ConditionallySetIntValue(IntVal, isPowerOf2_32(IntVal) && + IntVal <= MaxInterleaveFactor); case HK_FORCE: - return (Val <= 1); + return ConditionallySetIntValue(IntVal, IntVal <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: - return (Val == 0 || Val == 1); + return ConditionallySetIntValue(IntVal, IntVal == 0 || IntVal == 1); } return false; } @@ -74,25 +105,27 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, OptimizationRemarkEmitter &ORE) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), + : Width("vectorize.width", + ElementCount::getFixed(VectorizerParams::VectorizationFactor)), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L), - ORE(ORE) { + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), + TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); // force-vector-interleave overrides DisableInterleaving. 
if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; + Interleave.Value.U32 = VectorizerParams::VectorizationInterleave; - if (IsVectorized.Value != 1) + if (IsVectorized.Value.U32 != 1) // If the vectorization width and interleaving count are both 1 then // consider the loop to have been already vectorized because there's // nothing more that we can do. - IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; - LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() + IsVectorized.Value.U32 = Width.Value.EC == ElementCount::getFixed(1) && + Interleave.Value.U32 == 1; + LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value.U32 == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } @@ -112,7 +145,7 @@ TheLoop->setLoopID(NewLoopID); // Update internal cache. - IsVectorized.Value = 1; + IsVectorized.Value.U32 = 1; } bool LoopVectorizeHints::allowVectorization( @@ -152,7 +185,7 @@ using namespace ore; ORE.emit([&]() { - if (Force.Value == LoopVectorizeHints::FK_Disabled) + if (Force.Value.U32 == LoopVectorizeHints::FK_Disabled) return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", TheLoop->getStartLoc(), TheLoop->getHeader()) @@ -161,12 +194,13 @@ OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(), TheLoop->getHeader()); R << "loop not vectorized"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { + if (Force.Value.U32 == LoopVectorizeHints::FK_Enabled) { R << " (Force=" << NV("Force", true); - if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); - if (Interleave.Value != 0) - R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); + if (Width.Value.EC.isNonZero()) + R << ", Vector Width=" << NV("VectorWidth", Width.Value.EC); + if (Interleave.Value.U32 != 0) + R << ", Interleave Count=" + << NV("InterleaveCount", Interleave.Value.U32); R << ")"; } return R; @@ -175,11 +209,11 @@ } 
const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { - if (getWidth() == 1) + if (getWidth() == ElementCount::getFixed(1)) return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Disabled) return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) return LV_NAME; return OptimizationRemarkAnalysis::AlwaysPrint; } @@ -220,22 +254,15 @@ } } -void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { +void LoopVectorizeHints::setHint(StringRef Name, const Metadata *Arg) { if (!Name.startswith(Prefix())) return; Name = Name.substr(Prefix().size(), StringRef::npos); - const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); - if (!C) - return; - unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; for (auto H : Hints) { if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else + if (!H->validateAndSet(Arg)) LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); break; } Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8260,11 +8260,10 @@ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. - const unsigned UserVF = Hints.getWidth(); + const ElementCount UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = - LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. 
@@ -8426,12 +8425,11 @@ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. - unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = - LVP.plan(ElementCount::getFixed(UserVF), UserIC); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; Index: llvm/test/Transforms/LoopVectorize/metadata-width.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/metadata-width.ll +++ llvm/test/Transforms/LoopVectorize/metadata-width.ll @@ -24,7 +24,32 @@ ret void } +; CHECK-LABEL: @test2( +; CHECK: store <8 x i32> +; CHECK: ret void +define void @test2(i32* nocapture %a, i32 %n) #0 { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body, %entry + ret void +} + attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } !0 = !{!0, !1} !1 = !{!"llvm.loop.vectorize.width", i32 8} +!2 = !{!2, !3} +!3 = !{!"llvm.loop.vectorize.width", !4} +!4 = !{i32 8, i1 false} Index: llvm/test/Transforms/LoopVectorize/no_array_bounds2.ll =================================================================== --- /dev/null +++ 
llvm/test/Transforms/LoopVectorize/no_array_bounds2.ll @@ -0,0 +1,75 @@ +; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s + + +; #pragma clang loop vectorize(enable) +; for (int i = 0; i < number; i++) { +; A[B[i]]++; +; } + +; CHECK: warning: <unknown>:0:0: loop not interleaved: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +define dso_local void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 { +entry: + %cmp7 = icmp sgt i32 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !0 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1 + %1 = load i32, i32* %arrayidx2, align 4, !tbaa !0 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4, !tbaa !0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4 + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK: warning: <unknown>:0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +define dso_local void @foo2(i32* nocapture %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 { +entry: + %cmp7 = icmp sgt i32 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label 
%for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !0 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1 + %1 = load i32, i32* %arrayidx2, align 4, !tbaa !0 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4, !tbaa !0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !8 + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nofree norecurse nounwind "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = distinct !{!4, !5, !6} +!5 = !{!"llvm.loop.vectorize.enable", i1 true} +!6 = !{!"llvm.loop.vectorize.width", !7} +!7 = !{i32 1, i1 false} +!8 = distinct !{!8, !5, !9} +!9 = !{!"llvm.loop.vectorize.width", !10} +!10 = !{i32 1, i1 true}