Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -21,6 +21,7 @@
 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
@@ -1305,6 +1306,8 @@
   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                     unsigned AddrSpace) const;
 
+  bool isLegalToVectorizeReduction(RecurKind RecKind, bool Scalable) const;
+
   /// \returns The new vector factor value if the target doesn't support \p
   /// SizeInBytes loads or has a better vector factor.
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1644,6 +1647,8 @@
   virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const = 0;
+  virtual bool isLegalToVectorizeReduction(RecurKind RecKind,
+                                           bool Scalable) const = 0;
   virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                        unsigned ChainSizeInBytes,
                                        VectorType *VecTy) const = 0;
@@ -2170,6 +2175,10 @@
     return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
                                              AddrSpace);
   }
+  bool isLegalToVectorizeReduction(RecurKind RecKind,
+                                   bool Scalable) const override {
+    return Impl.isLegalToVectorizeReduction(RecKind, Scalable);
+  }
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -686,6 +686,9 @@
     return true;
   }
 
+  bool isLegalToVectorizeReduction(RecurKind RecKind,
+                                   bool Scalable) const { return true; }
+
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const {
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1035,6 +1035,11 @@
                                                     AddrSpace);
 }
 
+bool TargetTransformInfo::isLegalToVectorizeReduction(RecurKind RecKind,
+                                                      bool Scalable) const {
+  return TTIImpl->isLegalToVectorizeReduction(RecKind, Scalable);
+}
+
 unsigned TargetTransformInfo::getLoadVectorFactor(unsigned VF,
                                                   unsigned LoadSize,
                                                   unsigned ChainSizeInBytes,
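
For orientation: the new hook is meant to be queried once per reduction for a
candidate VF, with the base implementation accepting everything. A minimal
sketch of the expected call site (this mirrors the LoopVectorize hunk further
down; TTI, RdxDesc and VF name the usual objects at that point):

    // Reject the candidate VF if the target cannot widen this reduction.
    if (!TTI.isLegalToVectorizeReduction(RdxDesc.getRecurrenceKind(),
                                         VF.isScalable()))
      return false;
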
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3921,6 +3921,8 @@
     SDValue InVec = Op.getOperand(0);
     SDValue EltNo = Op.getOperand(1);
     EVT VecVT = InVec.getValueType();
+    if (VecVT.isScalableVector())
+      break;
     const unsigned BitWidth = Op.getValueSizeInBits();
     const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
     const unsigned NumSrcElts = VecVT.getVectorNumElements();
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -249,6 +249,8 @@
   bool supportsScalableVectors() const { return ST->hasSVE(); }
 
+  bool isLegalToVectorizeReduction(RecurKind RecKind, bool Scalable) const;
+
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const;
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1073,8 +1073,36 @@
   return Considerable;
 }
 
+bool AArch64TTIImpl::isLegalToVectorizeReduction(RecurKind RecKind,
+                                                 bool Scalable) const {
+  if (Scalable) {
+    switch (RecKind) {
+    case RecurKind::Add:
+    case RecurKind::FAdd:
+    case RecurKind::And:
+    case RecurKind::Or:
+    case RecurKind::Xor:
+    case RecurKind::SMin:
+    case RecurKind::SMax:
+    case RecurKind::UMin:
+    case RecurKind::UMax:
+    case RecurKind::FMin:
+    case RecurKind::FMax:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  return true;
+}
+
 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                            TTI::ReductionFlags Flags) const {
+  if (isa<ScalableVectorType>(Ty))
+    return true;
+
   auto *VTy = cast<FixedVectorType>(Ty);
   unsigned ScalarBits = Ty->getScalarSizeInBits();
   switch (Opcode) {
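
The scalable whitelist above matches the reductions SVE can perform natively
(integer add/min/max and the bitwise ops, plus FP add/min/max); there is, for
instance, no scalable multiply reduction, so RecurKind::Mul is rejected.
Illustrative checks of the intended behaviour (hypothetical, not part of the
patch):

    AArch64TTIImpl &TTI = ...; // some AArch64 TTI instance
    assert(TTI.isLegalToVectorizeReduction(RecurKind::Add, /*Scalable=*/true));
    assert(!TTI.isLegalToVectorizeReduction(RecurKind::Mul, /*Scalable=*/true));
    // Fixed-width reductions remain unrestricted:
    assert(TTI.isLegalToVectorizeReduction(RecurKind::Mul, /*Scalable=*/false));
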
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -28,6 +28,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/InstructionCost.h"
 
 namespace llvm {
 
@@ -174,7 +175,7 @@
   // Vector width with best cost
   ElementCount Width;
   // Cost of the loop with that width
-  unsigned Cost;
+  InstructionCost Cost;
 
   // Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() {
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1274,7 +1274,7 @@
   /// If interleave count has been specified by metadata it will be returned.
   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
   /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
+  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
 
   /// Memory access instruction may be vectorized in more than one way.
   /// Form of instruction after vectorization depends on cost.
@@ -1516,6 +1516,19 @@
            (SI && isLegalMaskedScatter(Ty, Align));
   }
 
+  bool isLegalWideningOperation(ElementCount VF) {
+    for (auto &Reduction : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &RdxDesc = Reduction.second;
+      if (!TTI.isLegalToVectorizeReduction(RdxDesc.getRecurrenceKind(),
+                                           VF.isScalable())) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Not vectorizing. Found invalid reduction type.\n");
+        return false;
+      }
+    }
+    return true;
+  }
+
   /// Returns true if \p I is an instruction that will be scalarized with
   /// predication. Such instructions include conditional stores and
   /// instructions that may divide by zero.
@@ -4587,7 +4600,6 @@
                                               RecurrenceDescriptor *RdxDesc,
                                               Value *StartV, unsigned UF,
                                               ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   PHINode *P = cast<PHINode>(PN);
   if (EnableVPlanNativePath) {
     // Currently we enter here in the VPlan-native path for non-induction
@@ -6031,7 +6043,7 @@
 }
 
 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
-                                                           unsigned LoopCost) {
+                                                           InstructionCost LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
   // There are many micro-architectural considerations that we can't predict
@@ -6162,7 +6174,7 @@
     LoopCost = *expectedCost(VF).first.getValue();
   }
 
-  assert(LoopCost && "Non-zero loop cost expected");
+  assert(LoopCost.getValue() && "Non-zero loop cost expected");
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
@@ -6183,12 +6195,12 @@
                     << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
-  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+  if (!InterleavingRequiresRuntimePointerCheck &&
+      (unsigned)*LoopCost.getValue() < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
     // loop overhead is about 5% of the cost of the loop.
     unsigned SmallIC =
-        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost /
+                                             *LoopCost.getValue()));
 
     // Interleave until store/load ports (estimated by max interleave count) are
     // saturated.
@@ -7650,6 +7662,9 @@
     CM.invalidateCostModelingDecisions();
   }
 
+  if (!CM.isLegalWideningOperation(UserVF))
+    return {{UserVF, InstructionCost::getInvalid()}};
+
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
@@ -7659,6 +7674,7 @@
   // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
   // VFs here, this should be reverted to only use legal UserVFs once the
   // loop below supports scalable VFs.
+  ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
   LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                     << " VF " << VF << ".\n");
@@ -9421,7 +9437,8 @@
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+    if (VF.Cost.isValid())
+      IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
   }
 
   // Identify the diagnostic messages that should be produced.
@@ -9434,6 +9451,13 @@
     return false;
   }
 
+  if (!VF.Cost.isValid()) {
+    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: The cost-model indicates that "
+                         "vectorization is not possible.\n");
+    VectorizeLoop = false;
+    return false;
+  }
+
   if (VF.Width.isScalar()) {
     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
     VecDiagMsg = std::make_pair(
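
A note on the cost plumbing above: moving VectorizationFactor::Cost from
unsigned to InstructionCost is what lets an impossible VF be reported, rather
than asserted, out of planning. A minimal sketch of the semantics relied on
here (as provided by llvm/Support/InstructionCost.h):

    // An invalid cost means "no valid cost exists", e.g. the reduction cannot
    // be widened at this VF; getValue() only yields a number for valid costs.
    InstructionCost Cost = InstructionCost::getInvalid();
    if (!Cost.isValid())
      reportFailure();           // hypothetical helper
    else
      consume(*Cost.getValue()); // hypothetical helper
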
Index: llvm/test/Transforms/LoopVectorize/scalable_reductions.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable_reductions.ll
@@ -0,0 +1,465 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -transform-warning -mtriple aarch64-unknown-linux-gnu -mattr=+sve -debug-only=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK
+; RUN: opt < %s -loop-vectorize -transform-warning -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S 2>&1 | FileCheck %s -check-prefix=CHECK-WARN
+
+; Reduction can be vectorized
+
+; ADD
+
+; int sum = 0;
+; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+; for (int i = 0; i < n; ++i)
+;   sum += a[i];
+; return sum;
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; OR
+
+; int foo(int * __restrict__ a, int * __restrict__ inv, int *b, int *c, int n) {
+;   int sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum |= a[i];
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %or, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %or = or i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %or, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; AND
+
+; int foo(int * __restrict__ a, int * __restrict__ inv, int *b, int *c, int n) {
+;   int sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum &= a[i];
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %and, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %and = and i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %and, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; XOR
+
+; int sum = 0;
+; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+; for (int i = 0; i < n; ++i)
+;   sum ^= a[i];
+; return sum;
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %xor, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %xor = xor i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %xor, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; SMIN
+
+; int foo(int * __restrict__ a, int * __restrict__ inv, int *b, int *c, int n) {
+;   int sum = 1;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::min(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.010 = phi i32 [ 2, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp.i = icmp slt i32 %0, %sum.010
+  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi i32 [ 1, %entry ], [ %.sroa.speculated, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; UMAX
+
+; unsigned foo(unsigned * __restrict__ a, int n) {
+;   unsigned sum = 1;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::max(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.010 = phi i32 [ 2, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp.i = icmp ugt i32 %0, %sum.010
+  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi i32 [ 1, %entry ], [ %.sroa.speculated, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; FADD
+
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum += a[i];
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fadd(float* noalias nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FADD (FAST)
+
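+; For reference, a C source sketch of this case (assumed; it mirrors the fadd
+; test above, with the loop compiled under fast-math so the fadd below carries
+; the 'fast' flag):
+;
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 0;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum += a[i];
+;   return sum;
+; }
+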
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fadd_fast(float* noalias nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FMIN (FAST)
+
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::min(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fmin_fast(float* noalias nocapture readonly %a, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.i = fcmp fast olt float %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FMAX (FAST)
+
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::max(sum, a[i]);
+;   return sum;
+; }
+
+; CHECK: LV: Found a vectorizable loop (vscale x 8)
+; CHECK: LV: Interleave Count is 2
+; CHECK: Setting best plan to VF=vscale x 8, UF=2
+define dso_local float @fmax_fast(float* noalias nocapture readonly %a, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.i = fcmp fast ogt float %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; Reduction cannot be vectorized
+
+; MUL
+
+; int sum = 2;
+; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+; for (int i = 0; i < n; ++i)
+;   sum *= a[i];
+; return sum;
+
+; CHECK-WARN: warning: <stdin>:0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+define dso_local i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 2, %for.body.preheader ], [ %mul, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 2, %entry ], [ %mul, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; FMIN
+
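+; For reference, a C source sketch of this case (assumed; as fmin_fast above,
+; but compiled without fast-math, so the fcmp below carries no fast-math
+; flags):
+;
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::min(sum, a[i]);
+;   return sum;
+; }
+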
+; CHECK-WARN: warning: <stdin>:0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+define dso_local float @fmin(float* noalias nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %.sroa.speculated, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp.i = fcmp olt float %0, %sum.07
+  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; FMAX
+
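+; For reference, a C source sketch of this case (assumed; as fmax_fast above,
+; but compiled without fast-math):
+;
+; float foo(float * __restrict__ a, int n) {
+;   float sum = 2;
+;   #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
+;   for (int i = 0; i < n; ++i)
+;     sum = std::max(sum, a[i]);
+;   return sum;
+; }
+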
!{!"llvm.loop.vectorize.width", i32 8} +!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!3 = !{!"llvm.loop.interleave.count", i32 2} +!4 = !{!"llvm.loop.vectorize.enable", i1 true}