Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -5927,6 +5927,20 @@ !0 = !{!"llvm.loop.vectorize.predicate.enable", i1 0} !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 1} +'``llvm.loop.vectorize.scalable.enable``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata selectively enables or disables scalable vectorization for the +loop. The first operand is the string ``llvm.loop.vectorize.scalable.enable`` +and the second operand is a bit. If the bit operand value is 1, scalable +vectorization is enabled, whereas a value of 0 reverts to the default +fixed-width vectorization: + +.. code-block:: llvm + + !0 = !{!"llvm.loop.vectorize.scalable.enable", i1 0} + !1 = !{!"llvm.loop.vectorize.scalable.enable", i1 1} + '``llvm.loop.vectorize.width``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -194,6 +194,9 @@ /// Find named metadata for a loop with an integer value. llvm::Optional getOptionalIntLoopAttribute(Loop *TheLoop, StringRef Name); +llvm::Optional +getOptionalElementCountLoopAttribute(Loop *TheLoop); + /// Create a new loop identifier for a loop created from a loop transformation. /// /// @param OrigLoopID The loop ID of the loop before the transformation. 
Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -29,6 +29,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/LoopUtils.h" namespace llvm { @@ -43,8 +44,14 @@ /// for example 'force', means a decision has been made. So, we need to be /// careful NOT to add them if the user hasn't specifically asked so. class LoopVectorizeHints { - enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED, - HK_PREDICATE }; + enum HintKind { + HK_WIDTH, + HK_UNROLL, + HK_FORCE, + HK_ISVECTORIZED, + HK_PREDICATE, + HK_SCALABLE + }; /// Hint - associates name and validation with the hint value. struct Hint { @@ -73,6 +80,9 @@ /// Vector Predicate Hint Predicate; + /// Says whether we should use fixed width or scalable vectorization. + Hint Scalable; + /// Return the loop metadata prefix. static StringRef Prefix() { return "llvm.loop."; } @@ -98,7 +108,9 @@ /// Dumps all the hint information. void emitRemarkWithHints() const; - unsigned getWidth() const { return Width.Value; } + ElementCount getWidth() const { + return ElementCount::get(Width.Value, isScalable()); + } unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } unsigned getPredicate() const { return Predicate.Value; } @@ -109,6 +121,8 @@ return (ForceKind)Force.Value; } + bool isScalable() const { return Scalable.Value; } + /// If hints are provided that force vectorization, use the AlwaysPrint /// pass name to force the frontend to print the diagnostic. 
const char *vectorizeAnalysisPassName() const; @@ -119,7 +133,9 @@ // enabled by default because can be unsafe or inefficient. For example, // reordering floating-point operations will change the way round-off // error accumulates in the loop. - return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; + ElementCount EC = getWidth(); + return getForce() == LoopVectorizeHints::FK_Enabled || + EC.getKnownMinValue() > 1; } bool isPotentiallyUnsafe() const { Index: llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp =================================================================== --- llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -48,12 +48,12 @@ if (hasVectorizeTransformation(L) == TM_ForcedByUser) { LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n"); - Optional VectorizeWidth = - getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width"); + Optional VectorizeWidth = + getOptionalElementCountLoopAttribute(L); Optional InterleaveCount = getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count"); - if (VectorizeWidth.getValueOr(0) != 1) + if (!VectorizeWidth || VectorizeWidth->isVector()) ORE->emit( DiagnosticInfoOptimizationFailure(DEBUG_TYPE, "FailedRequestedVectorization", Index: llvm/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopUtils.cpp +++ llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -301,6 +301,30 @@ return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false); } +llvm::Optional +llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop) { + const MDOperand *AttrMD = + findStringMetadataForLoop(TheLoop, "llvm.loop.vectorize.width") + .getValueOr(nullptr); + if (!AttrMD) + return None; + + if (ConstantInt *IntMD = mdconst::dyn_extract(AttrMD->get())) { + unsigned Width = IntMD->getZExtValue(); + bool IsScalable = false; + AttrMD = findStringMetadataForLoop(TheLoop, + 
"llvm.loop.vectorize.scalable.enable") + .getValueOr(nullptr); + if (AttrMD) { + IntMD = mdconst::dyn_extract(AttrMD->get()); + IsScalable = IntMD ? IntMD->getZExtValue() : false; + } + return ElementCount::get(Width, IsScalable); + } + + return None; +} + llvm::Optional llvm::getOptionalIntLoopAttribute(Loop *TheLoop, StringRef Name) { const MDOperand *AttrMD = @@ -450,14 +474,15 @@ if (Enable == false) return TM_SuppressedByUser; - Optional VectorizeWidth = - getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width"); + Optional VectorizeWidth = + getOptionalElementCountLoopAttribute(L); Optional InterleaveCount = getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count"); // 'Forcing' vector width and interleave count to one effectively disables // this tranformation. - if (Enable == true && VectorizeWidth == 1 && InterleaveCount == 1) + if (Enable == true && (VectorizeWidth && VectorizeWidth->isScalar()) && + InterleaveCount == 1) return TM_SuppressedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) @@ -466,10 +491,10 @@ if (Enable == true) return TM_ForcedByUser; - if (VectorizeWidth == 1 && InterleaveCount == 1) + if ((VectorizeWidth && VectorizeWidth->isScalar()) && InterleaveCount == 1) return TM_Disable; - if (VectorizeWidth > 1 || InterleaveCount > 1) + if ((VectorizeWidth && VectorizeWidth->isVector()) || InterleaveCount > 1) return TM_Enable; if (hasDisableAllTransformsHint(L)) Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -66,6 +66,7 @@ return (Val <= 1); case HK_ISVECTORIZED: case HK_PREDICATE: + case HK_SCALABLE: return (Val == 0 || Val == 1); } return false; @@ -78,7 +79,8 @@ Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), 
IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), + Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -91,7 +93,8 @@ // If the vectorization width and interleaving count are both 1 then // consider the loop to have been already vectorized because there's // nothing more that we can do. - IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; + IsVectorized.Value = + getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } @@ -164,7 +167,7 @@ if (Force.Value == LoopVectorizeHints::FK_Enabled) { R << " (Force=" << NV("Force", true); if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); + R << ", Vector Width=" << NV("VectorWidth", getWidth()); if (Interleave.Value != 0) R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); R << ")"; @@ -175,11 +178,11 @@ } const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { - if (getWidth() == 1) + if (getWidth() == ElementCount::getFixed(1)) return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Disabled) return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero()) return LV_NAME; return OptimizationRemarkAnalysis::AlwaysPrint; } @@ -230,7 +233,8 @@ return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; + Hint *Hints[] = {&Width, &Interleave, &Force, + &IsVectorized, &Predicate, &Scalable}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) Index: 
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8272,11 +8272,16 @@ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); // Get user vectorization factor. - const unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); + if (UserVF.isScalable()) { + // TODO: Use scalable UserVF once we've added initial support for scalable + // vectorization. For now we convert it to fixed width, but this will be + // removed in a later patch. + UserVF = ElementCount::getFixed(UserVF.getKnownMinValue()); + } // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = - LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -8438,12 +8443,18 @@ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); // Get user vectorization factor and interleave count. - unsigned UserVF = Hints.getWidth(); + ElementCount UserVF = Hints.getWidth(); + if (UserVF.isScalable()) { + // TODO: Use scalable UserVF once we've added initial support for scalable + // vectorization. For now we convert it to fixed width, but this will be + // removed in a later patch. + UserVF = ElementCount::getFixed(UserVF.getKnownMinValue()); + } + unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. 
- Optional MaybeVF = - LVP.plan(ElementCount::getFixed(UserVF), UserIC); + Optional MaybeVF = LVP.plan(UserVF, UserIC); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; Index: llvm/test/Transforms/LoopVectorize/metadata-width.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/metadata-width.ll +++ llvm/test/Transforms/LoopVectorize/metadata-width.ll @@ -24,7 +24,55 @@ ret void } +; CHECK-LABEL: @test2( +; CHECK: store <8 x i32> +; CHECK: ret void +define void @test2(i32* nocapture %a, i32 %n) #0 { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @test3( +; CHECK: store <8 x i32> +; CHECK: ret void +define void @test3(i32* nocapture %a, i32 %n) #0 { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 + +for.end: ; preds = %for.body, %entry + ret void +} + attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } !0 = !{!0, !1} !1 = !{!"llvm.loop.vectorize.width", i32 8} +!2 = !{!2, !1, !3} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i32 1} +!4 = !{!4, !1, !5} +!5 = !{!"llvm.loop.vectorize.scalable.enable", i32 0} Index: llvm/test/Transforms/LoopVectorize/no_array_bounds_scalable.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/no_array_bounds_scalable.ll @@ -0,0 +1,75 @@ +; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s + +; Like no_array_bounds.ll we verify warnings are generated when vectorization/interleaving is +; explicitly specified and fails to occur for both fixed and scalable vectorize.width loop hints. + +; #pragma clang loop vectorize(enable) +; for (int i = 0; i < number; i++) { +; A[B[i]]++; +; } + +; CHECK: warning: :0:0: loop not interleaved: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +define dso_local void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 { +entry: + %cmp7 = icmp sgt i32 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !0 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1 + %1 = load i32, i32* %arrayidx2, align 4, !tbaa !0 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4, !tbaa !0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp 
eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4 + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK: warning: :0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +define dso_local void @foo2(i32* nocapture %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 { +entry: + %cmp7 = icmp sgt i32 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !0 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1 + %1 = load i32, i32* %arrayidx2, align 4, !tbaa !0 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4, !tbaa !0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !7 + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nofree norecurse nounwind "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = distinct !{!4, !5, !6} +!5 = 
!{!"llvm.loop.vectorize.enable", i1 true} +!6 = !{!"llvm.loop.vectorize.width", i32 1} +!7 = distinct !{!7, !5, !6, !8} +!8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}