Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -215,6 +215,13 @@
   /// the accesses safely with.
   uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

+  /// True if there is an unsafe backward dependence whose minimum safe
+  /// distance is greater than the already-known distance of another
+  /// backward dependence.
+  bool isUnsafeBackwardMinDistGTOtherBackwardDist() const {
+    return UnsafeBackwardMinDistGTOtherBackwardDist;
+  }
+
   /// Return the number of elements that are safe to operate on
   /// simultaneously, multiplied by the size of the element in bits.
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -280,6 +287,11 @@
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;

+  /// True if there is an unsafe backward dependence whose minimum safe
+  /// distance is greater than the already-known distance of another
+  /// backward dependence.
+  bool UnsafeBackwardMinDistGTOtherBackwardDist;
+
   /// Number of elements (from consecutive iterations) that are safe to
   /// operate on simultaneously, multiplied by the size of the element in bits.
   /// The size of the element is taken from the memory access that is most
Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -235,6 +235,10 @@
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<PHINode *, 8>;

+  /// ReorderList contains the load/store instruction pairs which
+  /// should be reordered before vectorization.
+  using ReorderList = SmallVector<std::pair<Instruction *, Instruction *>, 2>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -353,6 +357,18 @@
     return ConditionalAssumes;
   }

+  /// Returns the ReorderNeededInstructionPairs list.
+  ///
+  /// The ReorderNeededInstructionPairs list is filled when
+  /// canVectorizeMemory finds unsafe memory accesses that can be
+  /// made safe by reordering the involved instructions.
+  ///
+  /// Also see canUnsafeDepsVectorize.
+  ReorderList &getReorderNeededInstructionPairs() {
+    return ReorderNeededInstructionPairs;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -387,6 +403,23 @@
   /// Returns true if the loop is vectorizable
   bool canVectorizeMemory();

+  /// Returns true if the unsafe memory accesses reported by
+  /// MemoryDepChecker can be made safe by reordering the involved
+  /// instructions.
+  ///
+  /// This method may add the involved instruction pairs to the
+  /// ReorderNeededInstructionPairs list; the client should reorder
+  /// each instruction pair before vectorization. Currently only the
+  /// following pattern is recorded, and the IR reordering may move
+  /// the second instruction and its dependences in front of the
+  /// first:
+  ///
+  ///   <StoreInst, LoadInst>
+  ///
+  /// If there are other unsafe patterns, this returns false and the
+  /// ReorderNeededInstructionPairs list is left empty.
+  bool canUnsafeDepsVectorize();
+
   /// Return true if we can vectorize this loop using the IF-conversion
   /// transformation.
   bool canVectorizeWithIfConvert();
@@ -520,6 +553,10 @@
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;

+  /// Holds the instruction pairs that need to be reordered.
+  /// Also see canUnsafeDepsVectorize.
+  ReorderList ReorderNeededInstructionPairs;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1623,6 +1623,7 @@
   if (MinDistanceNeeded > MaxSafeDepDistBytes) {
     LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                       << MinDistanceNeeded << " size in bytes");
+    UnsafeBackwardMinDistGTOtherBackwardDist = true;
     return Dependence::Backward;
   }
@@ -1663,6 +1664,7 @@
                                    const ValueToValueMap &Strides) {

   MaxSafeDepDistBytes = -1;
+  UnsafeBackwardMinDistGTOtherBackwardDist = false;
   SmallPtrSet<MemAccessInfo, 8> Visited;
   for (MemAccessInfo CurAccess : CheckDeps) {
     if (Visited.count(CurAccess))
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -874,15 +874,21 @@
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &(*GetLAA)(*TheLoop);
-  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
-  if (LAR) {
-    ORE->emit([&]() {
-      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
-                                        "loop not vectorized: ", *LAR);
-    });
-  }
-  if (!LAI->canVectorizeMemory())
+
+  // LoopAccessInfo checks the dependences based on source order, while
+  // canUnsafeDepsVectorize checks whether the dependences reported as
+  // unsafe could be made safe by reordering the IR.
+  if (!LAI->canVectorizeMemory() && !canUnsafeDepsVectorize()) {
+    const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+    if (LAR) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+                                          "loop not vectorized: ", *LAR);
+      });
+    }
     return false;
+  }

   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("Stores to a uniform address",
@@ -896,6 +902,60 @@
   return true;
 }

+bool LoopVectorizationLegality::canUnsafeDepsVectorize() {
+  bool BackwardCanVectorize = true;
+  Instruction *SourceInst, *DestInst;
+
+  if (LAI->hasConvergentOp())
+    return false;
+
+  const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+  if (DepChecker.isUnsafeBackwardMinDistGTOtherBackwardDist())
+    return false;
+
+  const auto *Dependences = DepChecker.getDependences();
+  if (Dependences == nullptr || Dependences->empty())
+    return false;
+
+  for (const auto &Dep : *Dependences) {
+    if (!BackwardCanVectorize)
+      break;
+
+    // If the dependence is not Backward, check that it is otherwise safe
+    // and move on to the next one.
+    if (Dep.Type != MemoryDepChecker::Dependence::Backward) {
+      BackwardCanVectorize &=
+          MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) ==
+          MemoryDepChecker::VectorizationSafetyStatus::Safe;
+      continue;
+    }
+
+    SourceInst = Dep.getSource(*LAI);
+    DestInst = Dep.getDestination(*LAI);
+
+    if (SourceInst->getParent() != DestInst->getParent()) {
+      // Moving instructions between basic blocks is not supported.
+      BackwardCanVectorize = false;
+    } else {
+      // Check whether the source and destination instructions form a
+      // load-after-store pair, which can be made vectorizable.
+      // FIXME: There are other patterns that could be vectorized, but
+      // they would need a data dependency check and would make the
+      // reordering more complex.
+      if (isa<StoreInst>(SourceInst) && isa<LoadInst>(DestInst))
+        ReorderNeededInstructionPairs.push_back(
+            std::make_pair(SourceInst, DestInst));
+      else
+        BackwardCanVectorize = false;
+    }
+  }
+
+  if (!BackwardCanVectorize)
+    ReorderNeededInstructionPairs.clear();
+
+  return BackwardCanVectorize;
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1921,6 +1921,43 @@
     collectSupportedLoops(*InnerL, LI, ORE, V);
 }

+/// Move the DestInst instruction and its dependences in front of the
+/// SourceInst instruction.
+static void moveDestInstBeforeSourceInst(Instruction *SourceInst,
+                                         Instruction *DestInst) {
+
+  if (DestInst->getParent() != SourceInst->getParent())
+    return;
+
+  if (DestInst->comesBefore(SourceInst))
+    return;
+
+  // First move the operands DestInst depends on, then DestInst itself.
+  for (auto &U : DestInst->operands()) {
+    if (isa<Instruction>(U))
+      moveDestInstBeforeSourceInst(SourceInst, cast<Instruction>(U));
+  }
+
+  DestInst->moveBefore(SourceInst);
+}
+
+/// Restore the instruction order according to OriginBlocks.
+///
+/// OriginBlocks stores the original instruction lists of the basic blocks
+/// whose instructions have been reordered.
+static void
+revertReorderedBBs(ArrayRef<SmallVector<Instruction *, 8>> OriginBlocks) {
+  for (const auto &OB : OriginBlocks) {
+    BasicBlock *BB = OB.front()->getParent();
+
+    // Restore the original instruction order.
+    for (Instruction *I : OB) {
+      I->removeFromParent();
+      BB->getInstList().push_back(I);
+    }
+  }
+}
+
 namespace {

 /// The LoopVectorize Pass.
@@ -9238,6 +9275,33 @@
                                 F, &Hints, IAI);
   CM.collectValuesToIgnore();

+  // Reorder the memory access instructions which are reported unsafe by
+  // MemoryDepChecker but which LoopVectorizationLegality says can be made
+  // safe.
+  SmallPtrSet<BasicBlock *, 4> BlockPtrsSet;
+  SmallVector<SmallVector<Instruction *, 8>, 2> OriginBlocks;
+  for (auto InstPair : LVL.getReorderNeededInstructionPairs()) {
+    Instruction *SourceInst = InstPair.first;
+    Instruction *DestInst = InstPair.second;
+
+    LLVM_DEBUG(dbgs() << "Reordering src: " << *SourceInst
+                      << " dest: " << *DestInst << "\n");
+
+    // Save the original order for reverting.
+    if (!BlockPtrsSet.contains(SourceInst->getParent())) {
+      BlockPtrsSet.insert(SourceInst->getParent());
+      SmallVector<Instruction *, 8> OriginInstrs;
+      for (Instruction &I : *(SourceInst->getParent()))
+        OriginInstrs.push_back(&I);
+      OriginBlocks.push_back(OriginInstrs);
+    }
+
+    // Currently only one case is handled: SourceInst is a StoreInst and
+    // DestInst is a LoadInst.
+    moveDestInstBeforeSourceInst(SourceInst, DestInst);
+  }
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
@@ -9264,6 +9328,10 @@
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                          "requirements.\n");
     Hints.emitRemarkWithHints();
+
+    if (!OriginBlocks.empty())
+      revertReorderedBBs(OriginBlocks);
+
     return false;
   }
@@ -9310,6 +9378,9 @@
   // Override IC if user provided an interleave count.
   IC = UserIC > 0 ? UserIC : IC;

+  if (!VectorizeLoop && !OriginBlocks.empty())
+    revertReorderedBBs(OriginBlocks);
+
   // Emit diagnostic messages, if any.
   const char *VAPassName = Hints.vectorizeAnalysisPassName();
   if (!VectorizeLoop && !InterleaveLoop) {
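For reference, a source-level sketch of what this reordering enables; the standalone functions and their names (foo_before, foo_after) are illustrative only and mirror the @foo case from the new test below:

#define N 1024
float a[N], b[N], c[N];

/* Before reordering: the store to a[i] is followed by a load of a[i + 1],
   a backward dependence of distance 1 that MemoryDepChecker reports as
   unsafe, so the loop is not vectorized. */
void foo_before(int LEN) {
  for (int i = 0; i < LEN - 1; i++) {
    a[i] = c[i] * 10;
    b[i] = a[i + 1] + 1;
  }
}

/* After moving the load (and the computation that depends on it) above the
   store, the same accesses form a forward dependence, which is safe to
   vectorize. This is the source-level equivalent of what
   moveDestInstBeforeSourceInst does on the IR. */
void foo_after(int LEN) {
  for (int i = 0; i < LEN - 1; i++) {
    float t = a[i + 1] + 1;
    a[i] = c[i] * 10;
    b[i] = t;
  }
}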
Index: llvm/test/Transforms/LoopVectorize/memdep.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/memdep.ll
+++ llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -91,38 +91,6 @@
   ret void
 }

-; Plausible dependence of distance 1 - cannot be vectorized (without reordering
-; accesses).
-; for (i = 0; i < 1024; ++i) {
-;   B[i] = A[i];
-;   A[i] = B[i + 1];
-; }
-
-; CHECK-LABEL: @f5(
-; CHECK-NOT: <2 x i32>
-
-define void @f5(i32* %A, i32* %B) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
-  store i32 %0, i32* %arrayidx2, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, 1
-  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv.next
-  %1 = load i32, i32* %arrayidx4, align 4
-  store i32 %1, i32* %arrayidx, align 4
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp ne i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
 ; Dependence through a phi node - must not vectorize.
 ; for (i = 0; i < 1024; ++i) {
 ;   a[i+1] = tmp;
Index: llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-load-after-store.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-load-after-store.ll
@@ -0,0 +1,550 @@
+; RUN: opt < %s -O2 -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -O2 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=REVERT
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; The C code corresponding to the following cases is as follows:
+;
+; #define N 1024
+;
+; float a[N], b[N], c[N], d[N], e[N], f[N];
+;
+; // Loops that can be vectorized
+; void foo(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; void foo1(int LEN) {
+;   for (int i = LEN - 2; i > -1; i--) {
+;     a[i + 1] = c[i] * 10;
+;     b[i] = a[i] + 1;
+;   }
+; }
+;
+; void foo2(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 2] + 1;
+;   }
+; }
+;
+; void foo3(int LEN) {
+;   for (int i = 1; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     a[i - 1] = d[i] * 11;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; void foo4(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     if (i % 2) {
+;       a[i] = c[i] * 10;
+;       b[i] = a[i + 2] + 1;
+;     } else {
+;       d[i] = e[i] + 11;
+;       f[i] = d[i + 2] * 2;
+;     }
+;   }
+; }
+;
+; void foo5(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;     d[i] = b[i + 1] - 2;
+;   }
+; }
+;
+; // Loops that can't be vectorized
+; void bar(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     if (i % 2)
+;       a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+
+@a = dso_local global [1024 x float] zeroinitializer, align 4
+@b = dso_local global [1024 x float] zeroinitializer, align 4
+@c = dso_local global [1024 x float] zeroinitializer, align 4
+@d = dso_local global [1024 x float] zeroinitializer, align 4
+@e = dso_local global [1024 x float] zeroinitializer, align 4
+@f = dso_local global [1024 x float] zeroinitializer, align 4
+
+define dso_local void @foo(i32
%LEN) #0 { +; CHECK-LABEL: @foo( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %add = add nsw i32 %5, 1 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo1(i32 %LEN) #0 { +; CHECK-LABEL: @foo1( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo1( +; REVERT-NOT: vector.body +; REVERT: for.body +; REVERT: load float +; REVERT: fmul float +; REVERT: store float +; REVERT: load float +; REVERT: fadd float +; REVERT: store float +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + %0 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %0, 2 + store i32 %sub, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp sgt i32 %1, -1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %add = add nsw i32 %4, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %5 to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + 
store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4 + %dec = add nsw i32 %8, -1 + store i32 %dec, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo2(i32 %LEN) #0 { +; CHECK-LABEL: @foo2( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo2( +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 2 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %add = add nsw i32 %5, 2 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo3(i32 %LEN) #0 { +; CHECK-LABEL: @foo3( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %5 to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %mul5 = fmul float %6, 
1.100000e+01 + %7 = load i32, i32* %i, align 4 + %sub6 = sub nsw i32 %7, 1 + %idxprom7 = sext i32 %sub6 to i64 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom7 + store float %mul5, float* %arrayidx8, align 4 + %8 = load i32, i32* %i, align 4 + %add = add nsw i32 %8, 1 + %idxprom9 = sext i32 %add to i64 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom9 + %9 = load float, float* %arrayidx10, align 4 + %add11 = fadd float %9, 1.000000e+00 + %10 = load i32, i32* %i, align 4 + %idxprom12 = sext i32 %10 to i64 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom12 + store float %add11, float* %arrayidx13, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo4(i32 %LEN) #0 { +; CHECK-LABEL: @foo4( +; CHECK: vector.body +; REVERT-LABEL: @foo4( +; REVERT-NOT: vector.body +; REVERT: for.body +; REVERT: if.then +; REVERT: load +; REVERT: fmul +; REVERT: store +; REVERT: load +; REVERT: fadd +; REVERT: if.else +; REVERT: load +; REVERT: fadd +; REVERT: store +; REVERT: load +; REVERT: fmul +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 2 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %for.body + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %mul = fmul float %4, 1.000000e+01 + %5 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %6 = load i32, i32* %i, align 4 + %add = add nsw i32 %6, 2 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %7 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %7, 1.000000e+00 + %8 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %8 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %if.end + +if.else: ; preds = %for.body + %9 = load i32, i32* %i, align 4 + %idxprom8 = sext i32 %9 to i64 + %arrayidx9 = getelementptr inbounds [1024 x float], [1024 x float]* @e, i64 0, i64 %idxprom8 + %10 = load float, float* %arrayidx9, align 4 + %add10 = fadd float %10, 1.100000e+01 + %11 = load i32, i32* %i, align 4 + %idxprom11 = sext i32 %11 to i64 + %arrayidx12 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %idxprom11 + store float %add10, float* %arrayidx12, align 4 + %12 = load i32, i32* %i, align 4 + %add13 = add nsw i32 %12, 2 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 
%idxprom14 + %13 = load float, float* %arrayidx15, align 4 + %mul16 = fmul float %13, 2.000000e+00 + %14 = load i32, i32* %i, align 4 + %idxprom17 = sext i32 %14 to i64 + %arrayidx18 = getelementptr inbounds [1024 x float], [1024 x float]* @f, i64 0, i64 %idxprom17 + store float %mul16, float* %arrayidx18, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %for.inc + +for.inc: ; preds = %if.end + %15 = load i32, i32* %i, align 4 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @foo5(i32 %LEN) #0 { +; CHECK-LABEL: @foo5( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo5( +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %3 = load float, float* %arrayidx, align 4 + %mul = fmul float %3, 1.000000e+01 + %4 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %4 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + %5 = load i32, i32* %i, align 4 + %add = add nsw i32 %5, 1 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %6, 1.000000e+00 + %7 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %7 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + %8 = load i32, i32* %i, align 4 + %add8 = add nsw i32 %8, 1 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom9 + %9 = load float, float* %arrayidx10, align 4 + %sub11 = fsub float %9, 2.000000e+00 + %10 = load i32, i32* %i, align 4 + %idxprom12 = sext i32 %10 to i64 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %idxprom12 + store float %sub11, float* %arrayidx13, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define dso_local void @bar(i32 %LEN) #0 { +; CHECK-LABEL: @bar( +; CHECK-NOT: vector.body +; CHECK: for.body +; CHECK: if.then +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: if.end +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %LEN.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %LEN, i32* %LEN.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* 
%LEN.addr, align 4 + %sub = sub nsw i32 %1, 1 + %cmp = icmp slt i32 %0, %sub + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %for.body + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %idxprom + %4 = load float, float* %arrayidx, align 4 + %mul = fmul float %4, 1.000000e+01 + %5 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom1 + store float %mul, float* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %6 = load i32, i32* %i, align 4 + %add = add nsw i32 %6, 1 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %idxprom3 + %7 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %7, 1.000000e+00 + %8 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %8 to i64 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %idxprom6 + store float %add5, float* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +}
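A note on the negative test @bar above: the store to a[i] is guarded by the `if (i % 2)` condition, so in the IR it sits in a different basic block than the load of a[i + 1]. canUnsafeDepsVectorize only reorders pairs within a single block (moving instructions between basic blocks is not supported), so this loop keeps its unsafe backward dependence and stays scalar. A minimal annotated C sketch of the rejected shape, for illustration only:

#define N 1024
float a[N], b[N], c[N];

/* Rejected: the conditional store to a[i] ends up in the if.then block,
   while the load of a[i + 1] is in the fall-through block, so the
   reordering helper cannot move the load above the store. */
void bar(int LEN) {
  for (int i = 0; i < LEN - 1; i++) {
    if (i % 2)
      a[i] = c[i] * 10;    /* store in the if.then block */
    b[i] = a[i + 1] + 1;   /* load in the if.end block */
  }
}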