diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -215,6 +215,13 @@
   /// the accesses safely with.
   uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 
+  /// True if there is an unsafe backward dependence whose
+  /// minimum safe distance is greater than another backward dependence's
+  /// distance which is already known.
+  bool isUnsafeBackwardMinDistGTOtherBackwardDist() const {
+    return UnsafeBackwardMinDistGTOtherBackwardDist;
+  }
+
   /// Return the number of elements that are safe to operate on
   /// simultaneously, multiplied by the size of the element in bits.
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -280,6 +287,11 @@
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;
 
+  /// True if there is an unsafe backward dependence whose
+  /// minimum safe distance is greater than another backward dependence's
+  /// distance which is already known.
+  bool UnsafeBackwardMinDistGTOtherBackwardDist;
+
   /// Number of elements (from consecutive iterations) that are safe to
   /// operate on simultaneously, multiplied by the size of the element in bits.
   /// The size of the element is taken from the memory access that is most
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -235,6 +235,17 @@
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
 
+  /// ReorderList contains the instructions which should be reordered
+  /// before vectorization.
+  ///
+  /// The items have the following format:
+  ///
+  ///   std::pair<Instruction *, SmallVector<Instruction *, 4>>
+  ///     .first  --> a load/store instruction
+  ///     .second --> a list of instructions to be moved before the first
+  using ReorderList =
+      SmallVector<std::pair<Instruction *, SmallVector<Instruction *, 4>>, 2>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -350,6 +361,19 @@
     return ConditionalAssumes;
   }
 
+  /// Returns the ReorderNeededInstructionPairs list.
+  ///
+  /// The ReorderNeededInstructionPairs list is filled when
+  /// canVectorizeMemory finds unsafe memory accesses that can be made
+  /// safe by reordering the involved instructions. Also see
+  /// canUnsafeDepsVectorize.
+  ///
+  /// The pairs are sorted in program order, and the instruction list in
+  /// each second field is also sorted in program order.
+  ReorderList &getReorderNeededInstructionPairs() {
+    return ReorderNeededInstructionPairs;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -384,6 +408,39 @@
   /// Returns true if the loop is vectorizable
   bool canVectorizeMemory();
 
+  /// Returns true if the unsafe memory accesses reported by
+  /// MemoryDepChecker can be made safe by reordering the
+  /// involved instructions.
+  ///
+  /// This method may add the involved instruction pairs to the
+  /// ReorderNeededInstructionPairs list, and the client should
+  /// reorder each instruction pair before vectorization. Also
+  /// see getReorderNeededInstructionPairs.
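+  ///
+  /// Illustrative example (it mirrors foo() in the added test): for
+  ///   a[i] = c[i] * 10;     // store, dependence source
+  ///   b[i] = a[i + 1] + 1;  // load of a[i + 1], dependence sink
+  /// hoisting the load of a[i + 1] (together with its address
+  /// computation) above the store to a[i] turns the unsafe backward
+  /// dependence into a forward dependence, which is safe to vectorize.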
+  bool canUnsafeDepsVectorize();
+
+  /// Returns true if the DESTINST instruction can be moved before
+  /// the SRCINST instruction, and adds all the involved instructions
+  /// to the INSTS list.
+  ///
+  /// This method does the following checks recursively:
+  ///   Step 1: call canMoveInstrs to check whether all the operands of
+  ///           DESTINST can be moved.
+  ///   Step 2: call canMoveBetweenInstrs to check whether the instructions
+  ///           between SRCINST and DESTINST which read/write the same
+  ///           underlying object as DESTINST can be moved.
+  bool canMoveInstrs(llvm::Instruction *SrcInst, llvm::Instruction *DestInst,
+                     SmallVector<Instruction *, 4> &Insts);
+
+  /// Returns true if the instructions between SRCINST and DESTINST
+  /// which read/write the same underlying object as DESTINST can
+  /// be moved before the SRCINST instruction.
+  ///
+  /// This method is called by canMoveInstrs and calls canMoveInstrs
+  /// too. Also see canMoveInstrs.
+  bool canMoveBetweenInstrs(llvm::Instruction *SrcInst,
+                            llvm::Instruction *DestInst,
+                            SmallVector<Instruction *, 4> &Insts);
+
   /// Return true if we can vectorize this loop using the IF-conversion
   /// transformation.
   bool canVectorizeWithIfConvert();
@@ -514,6 +571,10 @@
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;
 
+  /// Holds instruction pairs that need to be reordered.
+  /// Also see canUnsafeDepsVectorize.
+  ReorderList ReorderNeededInstructionPairs;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -124,7 +124,7 @@
 /// Enable store-to-load forwarding conflict detection. This option can
 /// be disabled for correctness testing.
-static cl::opt<bool> EnableForwardingConflictDetection(
+cl::opt<bool> EnableForwardingConflictDetection(
     "store-to-load-forwarding-conflict-detection", cl::Hidden,
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
@@ -1623,6 +1623,7 @@
     if (MinDistanceNeeded > MaxSafeDepDistBytes) {
       LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                         << MinDistanceNeeded << " size in bytes");
+      UnsafeBackwardMinDistGTOtherBackwardDist = true;
       return Dependence::Backward;
     }
@@ -1663,6 +1664,7 @@
                               const ValueToValueMap &Strides) {
 
   MaxSafeDepDistBytes = -1;
+  UnsafeBackwardMinDistGTOtherBackwardDist = false;
   SmallPtrSet<MemAccessInfo, 8> Visited;
   for (MemAccessInfo CurAccess : CheckDeps) {
     if (Visited.count(CurAccess))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -32,6 +32,7 @@
 #define DEBUG_TYPE LV_NAME
 
 extern cl::opt<bool> EnableVPlanPredication;
+extern cl::opt<bool> EnableForwardingConflictDetection;
 
 static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true),
                                         cl::Hidden,
@@ -876,15 +877,21 @@
 bool LoopVectorizationLegality::canVectorizeMemory() {
   LAI = &(*GetLAA)(*TheLoop);
-  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
-  if (LAR) {
-    ORE->emit([&]() {
-      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
-                                        "loop not vectorized: ", *LAR);
-    });
-  }
-  if (!LAI->canVectorizeMemory())
+
+  // LoopAccessInfo checks the dependences based on source order, while
+  // canUnsafeDepsVectorize checks whether the dependences that were
+  // reported unsafe can be made safe by reordering the IR.
+  if (!LAI->canVectorizeMemory() && !canUnsafeDepsVectorize()) {
+    const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+    if (LAR) {
+      ORE->emit([&]() {
+        return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+                                          "loop not vectorized: ", *LAR);
+      });
+    }
+    return false;
+  }
 
   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("Stores to a uniform address",
@@ -898,6 +905,343 @@
   return true;
 }
 
+bool LoopVectorizationLegality::canMoveInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 4> &Insts) {
+  LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: " << *SrcInst << "\n"
+                    << "    <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+  unsigned OpNums;
+
+  // Recursive exit 1: already checked
+  if (std::find(Insts.begin(), Insts.end(), DestInst) != Insts.end())
+    return true;
+  else
+    Insts.push_back(DestInst);
+
+  // Recursive exit 2: the DestInst is the same as SrcInst
+  if (DestInst == SrcInst) {
+    LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: The source instruction and "
+                      << "destination instruction are the same!!!\n");
+    return false;
+  }
+
+  // Recursive exit 3: Both access the same address
+  // In the process of recursion, the DestInst and SrcInst may operate
+  // on the same address.
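+  // (If they do access the same address, hoisting DestInst above SrcInst
+  // would swap two accesses to that location within one iteration and
+  // could change the value that is loaded or stored, so the move is
+  // rejected below.)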
+  if (isa<LoadInst>(DestInst) || isa<StoreInst>(DestInst)) {
+    Value *SrcPtr, *DestPtr;
+
+    if (isa<LoadInst>(SrcInst))
+      SrcPtr = cast<LoadInst>(SrcInst)->getPointerOperand();
+    else
+      SrcPtr = cast<StoreInst>(SrcInst)->getPointerOperand();
+
+    if (isa<LoadInst>(DestInst))
+      DestPtr = cast<LoadInst>(DestInst)->getPointerOperand();
+    else
+      DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+
+    if (SrcPtr == DestPtr) {
+      LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: Store/Load instructions access "
+                        << "the same address!!!\n");
+      return false;
+    }
+  }
+
+  // STEP1: Check the operands
+  OpNums = DestInst->getNumOperands();
+  for (unsigned Idx = 0; Idx < OpNums; Idx++) {
+    Instruction *OpInst = dyn_cast<Instruction>(DestInst->getOperand(Idx));
+
+    // The operand may be a plain llvm::Value, e.g.:
+    //   - a constant
+    //   - a function argument
+    //   - a global variable
+    if (!OpInst)
+      continue;
+
+    BasicBlock *OpBB = OpInst->getParent();
+    BasicBlock *DestBB = DestInst->getParent();
+    // This basic block may use instructions from another basic block, and
+    // that block may in turn use instructions from this one, so keep
+    // checking.
+    if (OpBB != DestBB) {
+      // The operand is defined outside the current function.
+      // FIXME: Don't know if this is necessary.
+      if (OpBB->getParent() != DestBB->getParent())
+        continue;
+
+      // FIXME: Don't know if there is any situation that can stop
+      // the checking here.
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: canMoveInstrs: Operand Instructions' Check\n");
+    CanMove = canMoveInstrs(SrcInst, OpInst, Insts);
+    if (!CanMove)
+      return false;
+  }
+
+  // STEP2: Check the instructions between the SrcInst and DestInst.
+  if (SrcInst->getParent() == DestInst->getParent() &&
+      SrcInst->comesBefore(DestInst))
+    return canMoveBetweenInstrs(SrcInst, DestInst, Insts);
+  else
+    return true;
+}
+
+bool LoopVectorizationLegality::canMoveBetweenInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 4> &Insts) {
+  LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: " << *SrcInst << "\n"
+                    << "    <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+
+  Value *Ptr = nullptr;
+  if (isa<LoadInst>(DestInst))
+    Ptr = cast<LoadInst>(DestInst)->getPointerOperand();
+  else if (isa<StoreInst>(DestInst))
+    Ptr = cast<StoreInst>(DestInst)->getPointerOperand();
+
+  // DestInst is not a load/store instruction.
+  if (!Ptr)
+    return true;
+
+  const ValueToValueMap &Strides =
+      getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+  int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, true);
+  Value *UnderlyingObject = getUnderlyingObject(Ptr, 6);
+
+  for (BasicBlock::iterator it = --(DestInst->getIterator()),
+                            begin = SrcInst->getIterator();
+       it != begin; --it) {
+    // The current instruction may already have been checked as an operand.
+    if (std::find(Insts.begin(), Insts.end(), (&*it)) != Insts.end())
+      continue;
+
+    // FIXME: Don't know how to check call instructions that
+    // read/write memory.
+    if (isa<CallInst>(&*it) &&
+        ((&*it)->mayReadFromMemory() || (&*it)->mayWriteToMemory())) {
+      LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Found memory access "
+                        << "call: " << (*it) << "\n");
+      return false;
+    }
+
+    if (isa<LoadInst>(&*it) || isa<StoreInst>(&*it)) {
+      Value *PtrTemp = nullptr;
+      if (isa<LoadInst>(&*it))
+        PtrTemp = cast<LoadInst>(&*it)->getPointerOperand();
+      else if (isa<StoreInst>(&*it))
+        PtrTemp = cast<StoreInst>(&*it)->getPointerOperand();
+
+      Value *UnderlyingObjectTemp = getUnderlyingObject(PtrTemp, 6);
+
+      LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs:\n"
+                        << "  UnderlyingObject: " << *UnderlyingObject << "\n"
+                        << "  UnderlyingObjectTemp: " << *UnderlyingObjectTemp
+                        << "\n");
+
+      // In some cases the value returned by getUnderlyingObject is itself
+      // still an Instruction.
+      // The following if statement may then be false, and the current
+      // instruction is assumed not to need moving. If the current
+      // instruction actually did need to be moved, that would mean there
+      // is a memory conflict between the current instruction and DestInst;
+      // the runtime memory checks can detect that conflict and fall back
+      // to the original scalar loop.
+      if (UnderlyingObjectTemp == UnderlyingObject) {
+        const SCEV *First = PSE.getSCEV(PtrTemp);
+        const SCEV *Second = PSE.getSCEV(Ptr);
+        const SCEV *Dist = PSE.getSE()->getMinusSCEV(Second, First);
+        const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+        if (!C) {
+          LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Found non-constant "
+                            << "distance!!!\n");
+          return false;
+        }
+
+        const APInt &Val = C->getAPInt();
+        int64_t Distance = Val.getSExtValue();
+        LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Distance between:\n"
+                          << "  " << *PtrTemp << "\n"
+                          << "  " << *Ptr << "\n"
+                          << "  is " << Distance << "\n");
+
+        // A related instruction needs to be moved up together with DestInst,
+        // so check whether it can be moved. E.g.
+        //
+        // Stride > 0
+        //   SrcInst:     c[i] = a[i];
+        //   RelatedInst: b[i + 1] = c[i];
+        //   DestInst:    a[i + 1] = b[i];
+        //
+        // Stride < 0
+        //   SrcInst:     c[i] = a[i];
+        //   RelatedInst: b[i - 1] = c[i];
+        //   DestInst:    a[i - 1] = b[i];
+        if ((Stride > 0 && Distance <= 0) || (Stride < 0 && Distance >= 0)) {
+          LLVM_DEBUG(dbgs() << "LV: canMoveBetweenInstrs: Related "
+                            << "Instructions' Check\n");
+          CanMove = canMoveInstrs(SrcInst, &*it, Insts);
+          if (!CanMove)
+            return false;
+        }
+      }
+    }
+  }
+
+  return CanMove;
+}
+
+bool LoopVectorizationLegality::canUnsafeDepsVectorize() {
+  bool BackwardCanVectorize = true;
+  SmallVector<MemoryDepChecker::Dependence, 4> BackwardDeps;
+
+  if (LAI->hasConvergentOp())
+    return false;
+
+  const MemoryDepChecker &Checker = LAI->getDepChecker();
+  if (Checker.isUnsafeBackwardMinDistGTOtherBackwardDist()) {
+    return false;
+  }
+
+  const auto &Dependences = Checker.getDependences();
+  if (Dependences == nullptr || Dependences->empty())
+    return false;
+
+  for (const auto &Dep : *Dependences) {
+    LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: "
+                      << "Unsafe dependence type is: " << Dep.Type << "\n");
+
+    // Unsafe dependences other than Backward cannot be handled; collect
+    // the Backward ones for the checks below.
+    if (Dep.Type != MemoryDepChecker::Dependence::Backward &&
+        MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
+            MemoryDepChecker::VectorizationSafetyStatus::Safe)
+      return false;
+    else if (Dep.Type == MemoryDepChecker::Dependence::Backward)
+      BackwardDeps.push_back(Dep);
+  }
+
+  // Check the backward dependences to see if the involved instructions
+  // can be moved.
+  for (const auto &Dep : BackwardDeps) {
+    Instruction *SourceInst = Dep.getSource(*LAI);
+    Instruction *DestInst = Dep.getDestination(*LAI);
+
+    if (SourceInst->getParent() != DestInst->getParent()) {
+      LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: "
+                        << "Src: " << *SourceInst << " Dest: " << *DestInst
+                        << " are in different basic blocks\n");
+
+      // Moving instructions between basic blocks is not supported.
+      BackwardCanVectorize = false;
+      break;
+    }
+
+    // For the following case, if the DestInst is moved before the
+    // SourceInst, it should not prevent store-to-load forwarding.
+    //   SourceInst: load a[i]
+    //   DestInst:   store a[i + n]
+    if (isa<LoadInst>(SourceInst) && isa<StoreInst>(DestInst) &&
+        EnableForwardingConflictDetection) {
+      // Get the distance between the SourceInst and the DestInst.
+      Value *SourcePtr = cast<LoadInst>(SourceInst)->getPointerOperand();
+      Value *DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+      Type *SourceTy = SourcePtr->getType()->getPointerElementType();
+      auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+      uint64_t TypeByteSize = DL.getTypeAllocSize(SourceTy);
+      const SCEV *Source = PSE.getSCEV(SourcePtr);
+      const SCEV *Dest = PSE.getSCEV(DestPtr);
+      const SCEV *Dist = PSE.getSE()->getMinusSCEV(Dest, Source);
+      const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+      if (!C) {
+        LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: Found "
+                          << "non-vectorizable store-after-load backward "
+                          << "dependence (non-constant distance):\n"
+                          << "  " << *SourceInst << "\n"
+                          << "  " << *DestInst << "\n");
+        BackwardCanVectorize = false;
+        break;
+      }
+
+      const APInt &Val = C->getAPInt();
+      int64_t Distance = Val.getSExtValue();
+
+      // Do a check similar to MemoryDepChecker::couldPreventStoreLoadForward.
+      //
+      // The difference is that here the distance is already known to be
+      // smaller than the minimum distance vectorization needs, so only the
+      // following check is required.
+      if (Distance % (2 * TypeByteSize)) {
+        LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: Found "
+                          << "non-vectorizable store-after-load backward "
+                          << "dependence (may prevent forwarding):\n"
+                          << "  " << *SourceInst << "\n"
+                          << "  " << *DestInst << "\n");
+        BackwardCanVectorize = false;
+        break;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: The original basic block:\n"
+                      << *(SourceInst->getParent()) << "\n");
+
+    SmallVector<Instruction *, 4> CheckedInsts;
+    BackwardCanVectorize = canMoveInstrs(SourceInst, DestInst, CheckedInsts);
+    LLVM_DEBUG(dbgs() << "LV: canUnsafeDepsVectorize: canMoveInstrs returned: "
+                      << BackwardCanVectorize << "\n");
+
+    if (!BackwardCanVectorize)
+      break;
+
+    // The CheckedInsts list may contain instructions that do not need to
+    // be moved, e.g.:
+    //   - an instruction in the for.preheader block used by DestInst
+    //   - an instruction already before SourceInst used by DestInst
+    SmallVector<Instruction *, 4> ReorderNeededInstructionList;
+    for (auto *I : CheckedInsts) {
+      if (I->getParent() == SourceInst->getParent() &&
+          SourceInst->comesBefore(I))
+        ReorderNeededInstructionList.push_back(I);
+    }
+    ReorderNeededInstructionPairs.push_back(
+        std::make_pair(SourceInst, ReorderNeededInstructionList));
+  }
+
+  if (!BackwardCanVectorize) {
+    ReorderNeededInstructionPairs.clear();
+  } else {
+    // The following sorts are important: they ensure the correctness of
+    // the move, and they keep the order of the instructions after the
+    // move as close as possible to their original order.
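+    // Illustrative note: for the foo() case in the added test, the list
+    // for the store to a[i] contains the induction increment, the GEP for
+    // a[i + 1] and the load itself; sorting with comesBefore moves the
+    // address computation first, so every hoisted instruction still
+    // follows its operands after the move.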
+    for (auto &InstPair : ReorderNeededInstructionPairs) {
+      SmallVector<Instruction *, 4> &Insts = InstPair.second;
+
+      llvm::sort(
+          Insts.begin(), Insts.end(),
+          [](Instruction *L, Instruction *R) { return L->comesBefore(R); });
+    }
+
+    llvm::sort(
+        ReorderNeededInstructionPairs.begin(),
+        ReorderNeededInstructionPairs.end(),
+        [](const std::pair<Instruction *, SmallVector<Instruction *, 4>> &L,
+           const std::pair<Instruction *, SmallVector<Instruction *, 4>> &R) {
+          Instruction *LInst = L.second.front();
+          Instruction *RInst = R.second.front();
+          return LInst->comesBefore(RInst);
+        });
+  }
+
+  return BackwardCanVectorize;
+}
+
 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
   Value *In0 = const_cast<Value *>(V);
   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9433,6 +9433,43 @@
                                 F, &Hints, IAI);
   CM.collectValuesToIgnore();
 
+  // Reorder the memory access instructions that MemoryDepChecker reported
+  // as unsafe but that LoopVectorizationLegality says can be made safe.
+  //
+  // After the planner has planned, this reordering is reverted.
+  SmallPtrSet<BasicBlock *, 2> BlockPtrsSet;
+  SmallVector<SmallVector<Instruction *, 4>, 2> OriginBlocks;
+  for (auto &InstPair : LVL.getReorderNeededInstructionPairs()) {
+    Instruction *SourceInst = InstPair.first;
+    SmallVector<Instruction *, 4> Insts = InstPair.second;
+
+    // Save the original order for reverting.
+    if (!BlockPtrsSet.contains(SourceInst->getParent())) {
+      BlockPtrsSet.insert(SourceInst->getParent());
+      SmallVector<Instruction *, 4> OriginInstrs;
+      for (Instruction &I : *(SourceInst->getParent())) {
+        OriginInstrs.push_back(&I);
+      }
+      OriginBlocks.push_back(OriginInstrs);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: Move instructions before: " << *SourceInst
+                      << "\n");
+
+    for (Instruction *I : Insts) {
+      // The instruction may already be before SourceInst because of an
+      // earlier move; if so, skip it.
+      if (I->comesBefore(SourceInst))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "  " << *I << "\n");
+
+      I->moveBefore(SourceInst);
+    }
+  }
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
 
@@ -9443,6 +9480,19 @@
   // Plan how to best vectorize, return the best VF and its cost.
   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
 
+  // Revert to the original loop order.
+  for (auto OB : OriginBlocks) {
+    BasicBlock *BB = OB.front()->getParent();
+
+    LLVM_DEBUG(dbgs() << "LV: The reordered basic block:\n" << *BB << "\n");
+
+    // Revert the instruction order.
+    for (auto *I : OB) {
+      I->removeFromParent();
+      BB->getInstList().push_back(I);
+    }
+  }
+
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -60,6 +60,7 @@
 }
 
 ; Plausible dependence of distance 2 - can be vectorized with a width of 2.
+; It can also be vectorized with a width of 4 by reordering the accesses.
 ; for (i = 0; i < 1024; ++i)
 ;   A[i+2] = A[i] + 1;
 
@@ -67,7 +68,7 @@
 ; CHECK: <2 x i32>
 
 ; WIDTH: f3_vec_len
-; WIDTH-NOT: <4 x i32>
+; WIDTH: <4 x i32>
 
 define void @f3_vec_len(i32* %A) {
 entry:
@@ -91,7 +92,7 @@
   ret void
 }
 
-; Plausible dependence of distance 1 - cannot be vectorized (without reordering
+; Plausible dependence of distance 1 - can be vectorized (with reordering
 ; accesses).
 ; for (i = 0; i < 1024; ++i) {
 ;   B[i] = A[i];
@@ -99,7 +100,7 @@
 ; }
 
 ; CHECK-LABEL: @f5(
-; CHECK-NOT: <2 x i32>
+; CHECK: <2 x i32>
 
 define void @f5(i32* %A, i32* %B) {
 entry:
@@ -244,8 +245,10 @@
 
 ; RIGHTVF-LABEL: @pr34283
 ; RIGHTVF: <4 x i64>
 
+; Before backward dependences could be vectorized, the following check
+; was 'WRONGVF-NOT: <8 x i64>'. Now the loop can be vectorized at this width.
 ; WRONGVF-LABLE: @pr34283
-; WRONGVF-NOT: <8 x i64>
+; WRONGVF: <8 x i64>
 
 @a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
diff --git a/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll
@@ -0,0 +1,495 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; The C code corresponding to the following cases is as follows:
+;
+; #define N 1024
+;
+; float a[N], b[N], c[N], d[N], e[N], f[N];
+;
+; // Loops that can be vectorized
+; void foo(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; void foo1(int LEN) {
+;   for (int i = LEN - 2; i > -1; i--) {
+;     a[i + 1] = c[i] * 10;
+;     b[i] = a[i] + 1;
+;   }
+; }
+;
+; void foo2(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;     d[i] = b[i + 1] - 2;
+;   }
+; }
+;
+; void foo3(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     a[i] = b[i] * 2;
+;     a[i + 1] = c[i] + 1;
+;   }
+; }
+;
+; void foo4(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i] * 2;
+;     a[i + 2] = c[i] + 1;
+;   }
+; }
+;
+; void foo5(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i] * 2;
+;     c[i + 2] = d[i];
+;     a[i + 2] = c[i] + 1;
+;   }
+; }
+;
+; // Loops that can't be vectorized
+; // The backward dependence involves different basic blocks.
+; void bar(int LEN) {
+;   for (int i = 0; i < LEN - 1; i++) {
+;     if (i % 2)
+;       a[i] = c[i] * 10;
+;     b[i] = a[i + 1] + 1;
+;   }
+; }
+;
+; // The backward dependence is a true data dependence (operand check).
+; void bar1(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i];
+;     a[i + 2] = b[i];
+;   }
+; }
+;
+; // The backward dependence is a true data dependence (related-instruction check).
+; void bar2(int LEN) {
+;   for (int i = 0; i < LEN - 2; i++) {
+;     b[i] = a[i];
+;     c[i + 2] = b[i];
+;     a[i + 2] = c[i];
+;   }
+; }
+;
+; // The backward dependence would turn into ForwardButPreventsForwarding.
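+; //   (If the store to a[i + 1] were hoisted above the load of a[i], the
+; //   resulting 4-byte store-to-load distance would prevent store-to-load
+; //   forwarding, so this loop is rejected instead.)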
+; void bar3(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; b[i] = a[i] * 2; +; a[i + 1] = c[i] + 1; +; } +; } +; +@a = dso_local global [1024 x float] zeroinitializer, align 4 +@b = dso_local global [1024 x float] zeroinitializer, align 4 +@c = dso_local global [1024 x float] zeroinitializer, align 4 +@d = dso_local global [1024 x float] zeroinitializer, align 4 +@e = dso_local global [1024 x float] zeroinitializer, align 4 +@f = dso_local global [1024 x float] zeroinitializer, align 4 + +define dso_local void @foo(i32 %LEN) #0 { +; CHECK-LABEL: @foo( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo1(i32 %LEN) #0 { +; CHECK-LABEL: @foo1( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: for.body +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %1 = sext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %1, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx, align 4 + %mul = fmul float %2, 1.000000e+01 + %3 = add nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %4, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.end + +for.end: 
; preds = %for.body, %entry + ret void +} + +define dso_local void @foo2(i32 %LEN) #0 { +; CHECK-LABEL: @foo2( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo2( +entry: + %cmp8 = icmp sgt i32 %LEN, 1 + br i1 %cmp8, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv.next + %3 = load float, float* %arrayidx10, align 4 + %sub11 = fadd float %3, -2.000000e+00 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + store float %sub11, float* %arrayidx13, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo3( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo4(i32 
%LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo4( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 2 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add, float* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo5(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo5( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp21 = icmp sgt i32 %LEN, 2 + br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %3 + store float %2, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx8, align 4 + %add9 = fadd float %4, 1.000000e+00 + %arrayidx12 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add9, float* %arrayidx12, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar(i32 %LEN) #0 { +; CHECK-LABEL: @bar( +; CHECK-NOT: vector.body +; CHECK: for.body +; CHECK: 
if.then +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: if.end +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp7 = icmp sgt i32 %LEN, 1 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %if.end + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end ] + %rem10 = and i64 %indvars.iv, 1 + %tobool.not = icmp eq i64 %rem10, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret void +} + +define dso_local void @bar1(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar1( +; CHECK-NOT: vector.body +entry: + %cmp13 = icmp sgt i32 %LEN, 2 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar2(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar2( +; CHECK-NOT: vector.body +entry: + %cmp20 = icmp sgt i32 %LEN, 2 + br i1 %cmp20, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 
%indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %3 = load float, float* %arrayidx8, align 4 + %arrayidx11 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %3, float* %arrayidx11, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar3( +; CHECK-NOT: vector.body +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}