Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -927,6 +927,10 @@
   /// \return The width of the smallest vector register type.
   unsigned getMinVectorRegisterBitWidth() const;
 
+  /// \return The largest value vscale can take for scalable vectors such as
+  /// <vscale x 4 x i32>. Targets that do not override it report 1.
+  unsigned getMaxVScale() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1495,6 +1499,7 @@
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
+  virtual unsigned getMaxVScale() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
@@ -1910,6 +1915,7 @@
   unsigned getMinVectorRegisterBitWidth() override {
     return Impl.getMinVectorRegisterBitWidth();
   }
+  unsigned getMaxVScale() const override { return Impl.getMaxVScale(); }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -352,6 +352,8 @@
 
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
+  unsigned getMaxVScale() const { return 1; }
+
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -567,6 +567,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  unsigned getMaxVScale() const { return 1; }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
   /// extracted from vectors.
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -622,6 +622,10 @@
   return TTIImpl->getMinVectorRegisterBitWidth();
 }
 
+unsigned TargetTransformInfo::getMaxVScale() const {
+  return TTIImpl->getMaxVScale();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -115,6 +115,12 @@
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  unsigned getMaxVScale() const {
+    return ST->hasSVE()
+               ? AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock
+               : BaseT::getMaxVScale();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1448,8 +1448,8 @@
   /// \return An upper bound for the vectorization factor, a power-of-2 larger
   /// than zero. One is returned if vectorization should best be avoided due
   /// to cost.
-  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
-                                    ElementCount UserVF);
+  Optional<ElementCount> computeFeasibleMaxVF(unsigned ConstTripCount,
+                                              ElementCount UserVF);
 
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -5256,7 +5256,11 @@
     return None;
   }
 
-  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+  Optional<ElementCount> MaybeMaxVF = computeFeasibleMaxVF(TC, UserVF);
+  if (!MaybeMaxVF)
+    return None;
+
+  ElementCount MaxVF = MaybeMaxVF.getValue();
 
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
@@ -5347,7 +5351,7 @@
   return None;
 }
 
-ElementCount
+Optional<ElementCount>
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                  ElementCount UserVF) {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@@ -5361,10 +5365,34 @@
   // dependence distance).
   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
 
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value if possible. Otherwise, return as is.
   if (UserVF.isNonZero()) {
-    // If legally unsafe, clamp the user vectorization factor to a safe value.
-    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType);
-    if (UserVF.getKnownMinValue() <= MaxSafeVF)
+    // Nothing to do if there are no dependencies.
+    if (MaxSafeRegisterWidth >= UINT_MAX)
+      return UserVF;
+
+    unsigned FixedVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType);
+    // If scalable, scale VF by vscale before checking if it's safe.
+    ElementCount MaxSafeVF =
+        UserVF.isScalable()
+            ? ElementCount::getScalable(FixedVF / TTI.getMaxVScale())
+            : ElementCount::getFixed(FixedVF);
+
+    if (UserVF.isScalable() && MaxSafeVF.isZero()) {
+      // Dependence distance too small to use scalable vectors. Bail out.
+      // TODO: Fixed-width vectorization may still be beneficial.
+      reportVectorizationFailure("Max legal vector width too small, scalable "
+                                 "vectorization unfeasible.",
+                                 "max legal vector width too small, scalable "
+                                 "vectorization unfeasible.",
+                                 "UnfeasibleScalableVF", ORE, TheLoop);
+      return None;
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (UserVF.getKnownMinValue() <= MaxSafeVF.getKnownMinValue())
       return UserVF;
 
     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
@@ -5379,7 +5407,7 @@
              << " is unsafe, clamping to maximum safe vectorization factor "
              << ore::NV("VectorizationFactor", MaxSafeVF);
     });
-    return ElementCount::getFixed(MaxSafeVF);
+    return MaxSafeVF;
   }
 
   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -0,0 +1,267 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; test1
+;
+; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32>
+; be used for vectorization. For fixed vectors the MaxVF=4, otherwise there
+; would be a dependence between vector lanes for vectors greater than 128 bits.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+;
+; For scalable vectorization 'vscale' has to be considered. With vscale=1 this
+; scalable VF would be valid, but the maximum vscale for SVE is 16, so this
+; loop cannot be vectorized in a scalable manner.
+
+; CHECK-LABEL: LV: Checking a loop in "test1"
+; CHECK: remark: <unknown>:0:0: loop not vectorized: max legal vector width too small, scalable vectorization unfeasible.
+define void @test1(i32* %a, i32* %b, i32 %N) {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.width", !3}
+!3 = !{i32 4, i1 true}
+
+; test2
+;
+; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2
+; words per 128-bits (unpacked).
+;
+; void test2(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(2, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
+
+; CHECK-LABEL: LV: Checking a loop in "test2"
+; CHECK: LV: The max safe VF is: vscale x 2.
+; CHECK: LV: Using user VF 2.
+define void @test2(i32* %a, i32* %b, i32 %N) {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !4
+}
+
+!4 = !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = !{!"llvm.loop.vectorize.width", !7}
+!7 = !{i32 2, i1 true}
+
+; test3
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test3(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
+
+; CHECK-LABEL: LV: Checking a loop in "test3"
+; CHECK: LV: The max safe VF is: vscale x 2.
+; CHECK: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+define void @test3(i32* %a, i32* %b, i32 %N) {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !8
+}
+
+!8 = !{!8, !9, !10}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.width", !11}
+!11 = !{i32 4, i1 true}
+
+; test4
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test4(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
+
+; CHECK-LABEL: LV: Checking a loop in "test4"
+; CHECK: LV: The max safe VF is: vscale x 8.
+; CHECK: LV: Using user VF 4
+define void @test4(i32* %a, i32* %b, i32 %N) {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !12
+}
+
+!12 = !{!12, !13, !14}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !{!"llvm.loop.vectorize.width", !15}
+!15 = !{i32 4, i1 true}
+
+; test5
+;
+; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
+;
+; void test5(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(16, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
+
+; CHECK-LABEL: LV: Checking a loop in "test5"
+; CHECK: LV: The max safe VF is: vscale x 8.
+; CHECK: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+define void @test5(i32* %a, i32* %b, i32 %N) {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !16
+}
+
+!16 = !{!16, !17, !18}
+!17 = !{!"llvm.loop.vectorize.enable", i1 true}
+!18 = !{!"llvm.loop.vectorize.width", !19}
+!19 = !{i32 16, i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -0,0 +1,60 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; test1
+;
+; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32>
+; be used for vectorization. For fixed vectors the MaxVF=4, otherwise there
+; would be a dependence between vector lanes for vectors greater than 128 bits.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+;
+; For scalable vectorization 'vscale' has to be considered. In this example
+; vectorization is only safe when vscale=1. The default max vscale is 1 unless
+; a target specifies otherwise, so it is safe to vectorize this loop.
+
+; CHECK-LABEL: LV: Checking a loop in "test1"
+; CHECK: The max safe VF is: vscale x 4.
+; CHECK: Using user VF 4.
+; CHECK: <4 x i32>
+define void @test1(i32* %a, i32* %b, i32 %N) {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.width", !3}
+!3 = !{i32 4, i1 true}