Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -21,6 +21,7 @@
 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
@@ -1305,6 +1306,8 @@
   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                     unsigned AddrSpace) const;
 
+  bool isLegalToVectorizeReduction(RecurKind RecKind, bool Scalable) const;
+
   /// \returns The new vector factor value if the target doesn't support \p
   /// SizeInBytes loads or has a better vector factor.
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1644,6 +1647,8 @@
   virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const = 0;
+  virtual bool isLegalToVectorizeReduction(RecurKind RecKind,
+                                           bool Scalable) const = 0;
   virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                        unsigned ChainSizeInBytes,
                                        VectorType *VecTy) const = 0;
@@ -2170,6 +2175,10 @@
     return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
                                              AddrSpace);
   }
+  bool isLegalToVectorizeReduction(RecurKind RecKind,
+                                   bool Scalable) const override {
+    return Impl.isLegalToVectorizeReduction(RecKind, Scalable);
+  }
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -686,6 +686,9 @@
     return true;
   }
 
+  bool isLegalToVectorizeReduction(RecurKind RecKind,
+                                   bool Scalable) const { return true; }
+
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const {
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1035,6 +1035,11 @@
                                                     AddrSpace);
 }
 
+bool TargetTransformInfo::isLegalToVectorizeReduction(RecurKind RecKind,
+                                                      bool Scalable) const {
+  return TTIImpl->isLegalToVectorizeReduction(RecKind, Scalable);
+}
+
 unsigned TargetTransformInfo::getLoadVectorFactor(unsigned VF,
                                                   unsigned LoadSize,
                                                   unsigned ChainSizeInBytes,
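
For orientation: the new hook is meant to be queried once per reduction for a
candidate VF, with the base implementation accepting everything. A minimal
sketch of the expected call site (this mirrors the LoopVectorize hunk further
down; TTI, RdxDesc and VF name the usual objects at that point):

    // Reject the candidate VF if the target cannot widen this reduction.
    if (!TTI.isLegalToVectorizeReduction(RdxDesc.getRecurrenceKind(),
                                         VF.isScalable()))
      return false;
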
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3921,6 +3921,8 @@
     SDValue InVec = Op.getOperand(0);
     SDValue EltNo = Op.getOperand(1);
     EVT VecVT = InVec.getValueType();
+    if (VecVT.isScalableVector())
+      break;
     const unsigned BitWidth = Op.getValueSizeInBits();
     const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
     const unsigned NumSrcElts = VecVT.getVectorNumElements();
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -249,6 +249,8 @@
   bool supportsScalableVectors() const { return ST->hasSVE(); }
 
+  bool isLegalToVectorizeReduction(RecurKind RecKind, bool Scalable) const;
+
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const;
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1073,8 +1073,36 @@
   return Considerable;
 }
 
+bool AArch64TTIImpl::isLegalToVectorizeReduction(RecurKind RecKind,
+                                                 bool Scalable) const {
+  if (Scalable) {
+    switch (RecKind) {
+    case RecurKind::Add:
+    case RecurKind::FAdd:
+    case RecurKind::And:
+    case RecurKind::Or:
+    case RecurKind::Xor:
+    case RecurKind::SMin:
+    case RecurKind::SMax:
+    case RecurKind::UMin:
+    case RecurKind::UMax:
+    case RecurKind::FMin:
+    case RecurKind::FMax:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  return true;
+}
+
 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                            TTI::ReductionFlags Flags) const {
+  if (isa<ScalableVectorType>(Ty))
+    return true;
+
   auto *VTy = cast<FixedVectorType>(Ty);
   unsigned ScalarBits = Ty->getScalarSizeInBits();
   switch (Opcode) {
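
The scalable whitelist above matches the reductions SVE can perform natively
(integer add/min/max and the bitwise ops, plus FP add/min/max); there is, for
instance, no scalable multiply reduction, so RecurKind::Mul is rejected.
Illustrative checks of the intended behaviour (hypothetical, not part of the
patch):

    AArch64TTIImpl &TTI = ...; // some AArch64 TTI instance
    assert(TTI.isLegalToVectorizeReduction(RecurKind::Add, /*Scalable=*/true));
    assert(!TTI.isLegalToVectorizeReduction(RecurKind::Mul, /*Scalable=*/true));
    // Fixed-width reductions remain unrestricted:
    assert(TTI.isLegalToVectorizeReduction(RecurKind::Mul, /*Scalable=*/false));
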
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -28,6 +28,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/InstructionCost.h"
 
 namespace llvm {
 
@@ -174,7 +175,7 @@
   // Vector width with best cost
   ElementCount Width;
   // Cost of the loop with that width
-  unsigned Cost;
+  InstructionCost Cost;
 
   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() {
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1274,7 +1274,7 @@
   /// If interleave count has been specified by metadata it will be returned.
   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
   /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
+  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
 
   /// Memory access instruction may be vectorized in more than one way.
   /// Form of instruction after vectorization depends on cost.
@@ -1516,6 +1516,19 @@
            (SI && isLegalMaskedScatter(Ty, Align));
   }
 
+  bool isLegalWideningOperation(ElementCount VF) {
+    for (auto &Reduction : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &RdxDesc = Reduction.second;
+      if (!TTI.isLegalToVectorizeReduction(RdxDesc.getRecurrenceKind(),
+                                           VF.isScalable())) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Not vectorizing. Found invalid reduction type.\n");
+        return false;
+      }
+    }
+    return true;
+  }
+
   /// Returns true if \p I is an instruction that will be scalarized with
   /// predication. Such instructions include conditional stores and
   /// instructions that may divide by zero.
@@ -4587,7 +4600,6 @@
                                               RecurrenceDescriptor *RdxDesc,
                                               Value *StartV, unsigned UF,
                                               ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   PHINode *P = cast<PHINode>(PN);
   if (EnableVPlanNativePath) {
     // Currently we enter here in the VPlan-native path for non-induction
@@ -6031,7 +6043,7 @@
 }
 
 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
-                                                           unsigned LoopCost) {
+                                                           InstructionCost LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
   // There are many micro-architectural considerations that we can't predict
@@ -6162,7 +6174,7 @@
     LoopCost = *expectedCost(VF).first.getValue();
   }
 
-  assert(LoopCost && "Non-zero loop cost expected");
+  assert(LoopCost.getValue() && "Non-zero loop cost expected");
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
@@ -6183,12 +6195,12 @@
                     << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
-  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+  if (!InterleavingRequiresRuntimePointerCheck &&
+      (unsigned)*LoopCost.getValue() < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
     // loop overhead is about 5% of the cost of the loop.
     unsigned SmallIC =
-        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost /
+                                             *LoopCost.getValue()));
 
     // Interleave until store/load ports (estimated by max interleave count) are
     // saturated.
@@ -7650,6 +7662,9 @@
     CM.invalidateCostModelingDecisions();
   }
 
+  if (!CM.isLegalWideningOperation(UserVF))
+    return {{UserVF, InstructionCost::getInvalid()}};
+
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
@@ -7659,6 +7674,7 @@
   // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
   // VFs here, this should be reverted to only use legal UserVFs once the
   // loop below supports scalable VFs.
+  ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
   LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                     << " VF " << VF << ".\n");
@@ -9421,7 +9437,8 @@
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+    if (VF.Cost.isValid())
+      IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
   }
 
   // Identify the diagnostic messages that should be produced.
@@ -9434,6 +9451,13 @@
     return false;
   }
 
+  if (!VF.Cost.isValid()) {
+    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: The cost-model indicates that "
+                         "vectorization is not possible.\n");
+    VectorizeLoop = false;
+    return false;
+  }
+
   if (VF.Width.isScalar()) {
     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
     VecDiagMsg = std::make_pair(
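
A note on the cost plumbing above: moving VectorizationFactor::Cost from
unsigned to InstructionCost is what lets an impossible VF be reported, rather
than asserted, out of planning. A minimal sketch of the semantics relied on
here (as provided by llvm/Support/InstructionCost.h):

    // An invalid cost means "no valid cost exists", e.g. the reduction cannot
    // be widened at this VF; getValue() only yields a number for valid costs.
    InstructionCost Cost = InstructionCost::getInvalid();
    if (!Cost.isValid())
      reportFailure();           // hypothetical helper
    else
      consume(*Cost.getValue()); // hypothetical helper
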
Index: llvm/test/Transforms/LoopVectorize/scalable_reductions.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable_reductions.ll
@@ -0,0 +1,465 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -transform-warning -mtriple aarch64-unknown-linux-gnu -mattr=+sve -debug-only=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK
+; RUN: opt < %s -loop-vectorize -transform-warning -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S 2>&1 | FileCheck %s -check-prefix=CHECK-WARN
+
+; Reduction can be vectorized
+
+; ADD
+
+; int sum = 0;
+; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+; for (int i = 0; i < n; ++i)
+;   sum += a[i];
+; return sum;
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; OR
+
+; int foo(int * __restrict__ a, int * __restrict__ inv, int *b, int *c, int n) {
+;   int sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum |= a[i];
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %or, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %or = or i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %or, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; AND
+
+; int foo(int * __restrict__ a, int * __restrict__ inv, int *b, int *c, int n) {
+;   int sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum &= a[i];
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %and, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %and = and i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %and, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; XOR
+
+; int sum = 0;
+; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+; for (int i = 0; i < n; ++i)
+;   sum ^= a[i];
+; return sum;
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %xor, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %xor = xor i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %xor, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; SMIN
+
+; int foo(int * __restrict__ a, int * __restrict__ inv, int *b, int *c, int n) {
+;   int sum = 1;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::min(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.010 = phi i32 [ 2, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp.i = icmp slt i32 %0, %sum.010
+  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi i32 [ 1, %entry ], [ %.sroa.speculated, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; UMAX
+
+; unsigned foo(unsigned * __restrict__ a, int n) {
+;   unsigned sum = 1;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::max(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.010 = phi i32 [ 2, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp.i = icmp ugt i32 %0, %sum.010
+  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi i32 [ 1, %entry ], [ %.sroa.speculated, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; FADD
+
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum += a[i];
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fadd(float* noalias nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FADD (FAST)
+
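+; For reference, a C source sketch of this case (assumed; it mirrors the fadd
+; test above, with the loop compiled under fast-math so the fadd below carries
+; the 'fast' flag):
+;
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum += a[i];
+;   return sum;
+; }
+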
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fadd_fast(float* noalias nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FMIN (FAST)
+
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::min(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fmin_fast(float* noalias nocapture readonly %a, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.i = fcmp fast olt float %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FMAX (FAST)
+
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::max(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fmax_fast(float* noalias nocapture readonly %a, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.i = fcmp fast ogt float %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; Reduction cannot be vectorized
+
+; MUL
+
+; int sum = 2;
+; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+; for (int i = 0; i < n; ++i)
+;   sum *= a[i];
+; return sum;
+
+; CHECK-WARN: warning: <stdin>:0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+define dso_local i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %mul, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %mul, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; FMIN
+
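+; For reference, a C source sketch of this case (assumed; as fmin_fast above,
+; but compiled without fast-math, so the fcmp below carries no fast-math
+; flags):
+;
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::min(sum, a[i]);
+;   return sum;
+; }
+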
+; CHECK-WARN: warning: <stdin>:0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+define dso_local float @fmin(float* noalias nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.i = fcmp olt float %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FMAX
+
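+; For reference, a C source sketch of this case (assumed; as fmax_fast above,
+; but compiled without fast-math):
+;
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::max(sum, a[i]);
+;   return sum;
+; }
+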
!{!"llvm.loop.vectorize.width", i32 8} +!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!3 = !{!"llvm.loop.interleave.count", i32 2} +!4 = !{!"llvm.loop.vectorize.enable", i1 true}