Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -928,6 +928,10 @@
   /// \return The width of the smallest vector register type.
   unsigned getMinVectorRegisterBitWidth() const;
 
+  /// \return The maximum value for vscale in scalable vectors such as
+  /// <vscale x 4 x i32>. Default is None.
+  Optional<unsigned> getMaxVScale() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1504,6 +1508,7 @@
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
+  virtual Optional<unsigned> getMaxVScale() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
   virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
@@ -1921,6 +1926,9 @@
   unsigned getMinVectorRegisterBitWidth() override {
     return Impl.getMinVectorRegisterBitWidth();
   }
+  Optional<unsigned> getMaxVScale() const override {
+    return Impl.getMaxVScale();
+  }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -352,6 +352,8 @@
 
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
+  llvm::Optional<unsigned> getMaxVScale() const { return None; }
+
  bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -571,6 +571,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  Optional<unsigned> getMaxVScale() const { return None; }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
   /// extracted from vectors.
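Usage note (not part of the patch): a target that knows an architectural upper bound on its vector length can override the new hook in its TTI implementation, following the same shape as the AArch64 change below. A minimal sketch; the names MyTargetTTIImpl and ST->hasScalableVectors() and the 2048/128-bit figures are illustrative assumptions, not real APIs:

// Sketch only: hypothetical target override of the new TTI hook.
Optional<unsigned> MyTargetTTIImpl::getMaxVScale() const {
  // vscale is bounded by (maximum architectural vector length in bits) /
  // (bits covered by one vscale unit), e.g. 2048 / 128 = 16 for an
  // SVE-like target.
  if (ST->hasScalableVectors())
    return 2048 / 128;
  return BaseT::getMaxVScale(); // Default: None.
}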
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -627,6 +627,10 @@
   return TTIImpl->getMinVectorRegisterBitWidth();
 }
 
+llvm::Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
+  return TTIImpl->getMaxVScale();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -115,6 +115,12 @@
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  Optional<unsigned> getMaxVScale() const {
+    if (ST->hasSVE())
+      return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+    return BaseT::getMaxVScale();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -272,6 +272,12 @@
         "an instruction to a single constant value. Mostly "
         "useful for getting consistent testing."));
 
+static cl::opt<bool> ForceTargetSupportsScalableVectors(
+    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Allow scalable vectorization for targets that don't support scalable "
+        "vectors."));
+
 static cl::opt<unsigned> SmallLoopCost(
     "small-loop-cost", cl::init(20), cl::Hidden,
     cl::desc(
@@ -5562,15 +5568,63 @@
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
-  if (UserVF.isNonZero()) {
-    // For now, don't verify legality of scalable vectors.
-    // This will be addressed properly in https://reviews.llvm.org/D91718.
-    if (UserVF.isScalable())
+  bool IgnoreUserVF = UserVF.isScalable() && !TTI.supportsScalableVectors() &&
+                      !ForceTargetSupportsScalableVectors;
+  if (IgnoreUserVF) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Ignoring VF=" << UserVF
+               << " because target does not support scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreUserVF",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+             << " because target does not support scalable vectors.";
+    });
+  }
+
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  if (UserVF.isNonZero() && !IgnoreUserVF) {
+    // Nothing to do if there are no dependencies.
+    if (MaxSafeVectorWidthInBits == UINT_MAX)
       return UserVF;
 
-    // If legally unsafe, clamp the user vectorization factor to a safe value.
-    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
-    if (UserVF.getFixedValue() <= MaxSafeVF)
+    unsigned MaxSafeElements =
+        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+
+    if (UserVF.isScalable()) {
+      Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+      assert(MaxVScale &&
+             "max vscale undefined for target that supports scalable vectors!");
+
+      // Scale VF by vscale before checking if it's safe.
+      MaxSafeVF =
+          ElementCount::getScalable(MaxSafeElements / MaxVScale.getValue());
+
+      if (MaxSafeVF.isZero()) {
+        // The dependence distance is too small to use scalable vectors,
+        // fall back on fixed.
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Max legal vector width too small, scalable vectorization "
+               "unfeasible. Using fixed-width vectorization instead.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                            TheLoop->getStartLoc(),
+                                            TheLoop->getHeader())
+                 << "Max legal vector width too small, scalable vectorization "
+                 << "unfeasible. Using fixed-width vectorization instead.";
+        });
+        return computeFeasibleMaxVF(
+            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
       return UserVF;
 
     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
@@ -5585,7 +5639,7 @@
              << " is unsafe, clamping to maximum safe vectorization factor "
              << ore::NV("VectorizationFactor", MaxSafeVF);
     });
-    return ElementCount::getFixed(MaxSafeVF);
+    return MaxSafeVF;
   }
 
   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
@@ -7385,17 +7439,24 @@
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
-  if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+  if (!UserVF.isZero() &&
+      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
+    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
+    // VFs here, this should be reverted to only use legal UserVFs once the
+    // loop below supports scalable VFs.
+    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+                      << " VF " << VF << ".\n");
+    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
+    CM.selectUserVectorizationFactor(VF);
     CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(UserVF, UserVF);
+    buildVPlansWithVPRecipes(VF, VF);
     LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    return {{VF, 0}};
   }
 
   assert(!MaxVF.isScalable() &&
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,101 @@
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+
+; CHECKUF1: for.body.preheader:
+; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECKUF1: vector.ph:
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF1: vector.body:
+; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
+
+; CHECKUF2: for.body.preheader:
+; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
+
+; CHECKUF2: vector.ph:
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
+; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF2: vector.body:
+; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
+; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -0,0 +1,333 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SVE %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; These tests validate the behaviour of scalable vectorization factor hints,
+; where the following applies:
+;
+; * If the backend does not support scalable vectors, ignore the hint and let
+;   the vectorizer pick a VF.
+; * If there are no dependencies and the VF is a power of 2, the VF should be
+;   accepted. This applies to both fixed and scalable VFs.
+; * If the dependency is too small to use scalable vectors, change the VF to
+;   fixed, where existing behavior applies (clamping).
+; * If scalable vectorization is feasible given the dependency and the VF is
+;   valid, accept it. Otherwise, clamp to the max scalable VF.
+
+; test1
+;
+; Scalable vectorization unfeasible, clamp VF from (4, scalable) -> (4, fixed).
+;
+; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32>
+; be used for vectorization. For fixed vectors the MaxVF=8, otherwise there
+; would be a dependence between vector lanes for vectors greater than 256 bits.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 8] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 8.
+; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-LABEL: @test1
+; CHECK: <4 x i32>
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test2
+;
+; Scalable vectorization unfeasible, clamp VF from (8, scalable) -> (4, fixed).
+;
+; void test2(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(8, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-LABEL: @test2
+; CHECK: <4 x i32>
+define void @test2(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !3
+
+exit:
+  ret void
+}
+
+!3 = !{!3, !4, !5}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test3
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2
+; words per 128-bits (unpacked).
+;
+; void test3(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(2, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-LABEL: @test3
+; CHECK: <vscale x 2 x i32>
+define void @test3(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !6
+
+exit:
+  ret void
+}
+
+!6 = !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
+!8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test4
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test4(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2
+; CHECK-LABEL: @test4
+; CHECK: <vscale x 2 x i32>
+define void @test4(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !9
+
+exit:
+  ret void
+}
+
+!9 = !{!9, !10, !11}
+!10 = !{!"llvm.loop.vectorize.width", i32 4}
+!11 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test5
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test5(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-LABEL: @test5
+; CHECK: <vscale x 4 x i32>
+define void @test5(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12
+
+exit:
+  ret void
+}
+
+!12 = !{!12, !13, !14}
+!13 = !{!"llvm.loop.vectorize.width", i32 4}
+!14 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test6
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
+;
+; void test6(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(16, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
+; CHECK-LABEL: @test6
+; CHECK: <vscale x 8 x i32>
+define void @test6(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !15
+
+exit:
+  ret void
+}
+
+!15 = !{!15, !16, !17}
+!16 = !{!"llvm.loop.vectorize.width", i32 16}
+!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: LV: Selecting VF: 4.
+; CHECK-NO-SVE: <4 x i32>
+define void @test_no_sve(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !18
+
+exit:
+  ret void
+}
+
+!18 = !{!18, !19, !20}
+!19 = !{!"llvm.loop.vectorize.width", i32 4}
+!20 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/metadata-width.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-target-supports-scalable-vectors=true -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
+++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
 
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
-
-; CHECKUF1: for.body.preheader:
-; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
-; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
-; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
-
-; CHECKUF1: vector.ph:
-; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
-; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
-; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-
-; CHECKUF1: vector.body:
-; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
-; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
-; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
-; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
-; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
-; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
-; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
-; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
-
-
-; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
-; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
-
-; CHECKUF2: for.body.preheader:
-; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
-; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
-; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
-
-; CHECKUF2: vector.ph:
-; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
-; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
-; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-
-; CHECKUF2: vector.body:
-; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
-; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
-; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
-; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
-; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
-; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
-; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
-; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
-; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
-; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
-; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
-; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
-; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
-; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
-; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
-; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
-; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
-
-define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
-entry:
-  %cmp7 = icmp sgt i32 %N, 0
-  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 8
-  %add = fadd double %0, 1.000000e+00
-  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
-  store double %add, double* %arrayidx2, align 8
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
-}
-
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: LV: The Widest register safe to use is: 32 bits.
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
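A worked sketch (not part of the patch) of how the new clamping in computeFeasibleMaxVF plays out for the AArch64 hint tests above, assuming SVE's getMaxVScale() of 2048 / 128 = 16 and i32 elements; powerOf2Floor below is a local stand-in for llvm::PowerOf2Floor so the snippet is self-contained:

#include <cstdint>
#include <cstdio>

// Largest power of two <= X (local stand-in for llvm::PowerOf2Floor).
static uint64_t powerOf2Floor(uint64_t X) {
  while (X & (X - 1))
    X &= X - 1; // clear the lowest set bit until one bit remains
  return X;
}

int main() {
  const uint64_t MaxVScale = 2048 / 128; // SVE: vscale <= 16 (assumption)
  const uint64_t WidestType = 32;        // i32 elements

  // Dependence distances (in elements) matching test1/test2 (8),
  // test3/test4 (32) and test5/test6 (128) above.
  const uint64_t Distances[] = {8, 32, 128};
  for (uint64_t Dist : Distances) {
    uint64_t MaxSafeVectorWidthInBits = Dist * WidestType;
    uint64_t MaxSafeElements =
        powerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    uint64_t ScalableMaxSafeVF = MaxSafeElements / MaxVScale;
    if (ScalableMaxSafeVF == 0)
      printf("distance %llu: scalable unfeasible, fixed MaxSafeVF = %llu\n",
             (unsigned long long)Dist, (unsigned long long)MaxSafeElements);
    else
      printf("distance %llu: MaxSafeVF = vscale x %llu\n",
             (unsigned long long)Dist, (unsigned long long)ScalableMaxSafeVF);
  }
  return 0;
}

Under these assumptions the output matches the CHECK-DBG lines: a fixed-width fallback with MaxSafeVF = 8 for the 8-element distance (test1/test2), vscale x 2 for the 32-element distance (test3/test4), and vscale x 8 for the 128-element distance (test5/test6).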