diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -143,9 +143,13 @@
     unsigned Destination;
     /// The type of the dependence.
     DepType Type;
+    /// The reorder information. Only Backward dependences may have this.
+    SmallVector<Instruction *, 2> ReorderInfo;
 
-    Dependence(unsigned Source, unsigned Destination, DepType Type)
-        : Source(Source), Destination(Destination), Type(Type) {}
+    Dependence(unsigned Source, unsigned Destination, DepType Type,
+               SmallVector<Instruction *, 2> ReorderInfo)
+        : Source(Source), Destination(Destination), Type(Type),
+          ReorderInfo(ReorderInfo) {}
 
     /// Return the source instruction of the dependence.
     Instruction *getSource(const LoopAccessInfo &LAI) const;
@@ -205,6 +209,10 @@
     return Status == VectorizationSafetyStatus::Safe;
   }
 
+  /// Returns true if all the unsafe dependences are backward and have
+  /// non-empty reorder information.
+  bool canUnsafeDepsVectorize() const;
+
   /// Return true if the number of elements that are safe to operate on
   /// simultaneously is not bounded.
   bool isSafeForAnyVectorWidth() const {
@@ -280,6 +288,11 @@
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;
 
+  /// True if there is an unsafe backward dependence whose minimum safe
+  /// distance is greater than the distance of another backward dependence
+  /// that is already known.
+  bool UnsafeBackwardMinDistGTOtherBackwardDist;
+
   /// Number of elements (from consecutive iterations) that are safe to
   /// operate on simultaneously, multiplied by the size of the element in bits.
   /// The size of the element is taken from the memory access that is most
@@ -320,6 +333,37 @@
                     const MemAccessInfo &B, unsigned BIdx,
                     const ValueToValueMap &Strides);
 
+  /// Returns true if the DESTINST instruction can be moved before the
+  /// SRCINST instruction, and adds all the involved instructions to the
+  /// INSTS list.
+  ///
+  /// This method performs the following checks recursively:
+  ///   Step 1: call canMoveInstrs to check whether all the operands of
+  ///           DESTINST can be moved.
+  ///   Step 2: call canMoveBetweenInstrs to check whether the instructions
+  ///           between SRCINST and DESTINST that read/write the same
+  ///           underlying object as DESTINST can be moved.
+  bool canMoveInstrs(llvm::Instruction *SrcInst, llvm::Instruction *DestInst,
+                     SmallVector<Instruction *, 2> &Insts,
+                     const ValueToValueMap &Strides);
+
+  /// Returns true if the instructions between SRCINST and DESTINST that
+  /// read/write the same underlying object as DESTINST does can be moved
+  /// before the SRCINST instruction.
+  ///
+  /// This method is called by canMoveInstrs and calls canMoveInstrs too.
+  /// Also see canMoveInstrs.
+  bool canMoveBetweenInstrs(llvm::Instruction *SrcInst,
+                            llvm::Instruction *DestInst,
+                            SmallVector<Instruction *, 2> &Insts,
+                            const ValueToValueMap &Strides);
+
+  /// Returns true if the accesses can be turned into a forward dependence
+  /// by reordering instructions.
+  bool isReorderingToForwardPossible(
+      unsigned AIdx, unsigned BIdx, const ValueToValueMap &Strides,
+      SmallVector<Instruction *, 2> &ReorderNeededInstructionList);
+
   /// Check whether the data dependence could prevent store-load
   /// forwarding.
   ///
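For orientation, below is a minimal sketch of how a client of MemoryDepChecker might consume the new ReorderInfo field together with canUnsafeDepsVectorize(). It is illustrative only and not part of the patch; the helper name collectReorderWork and the SmallVector element counts reconstructed above are assumptions.

// Illustrative sketch (not part of the patch): gather the instructions that
// would have to be hoisted for every Backward dependence the checker recorded.
#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

static void collectReorderWork(const MemoryDepChecker &DepChecker,
                               SmallVectorImpl<Instruction *> &ToMove) {
  // Bail out unless every unsafe dependence is a Backward dependence that
  // carries non-empty reorder information.
  if (!DepChecker.canUnsafeDepsVectorize())
    return;
  if (const auto *Deps = DepChecker.getDependences())
    for (const MemoryDepChecker::Dependence &Dep : *Deps)
      if (Dep.Type == MemoryDepChecker::Dependence::Backward)
        // ReorderInfo lists the instructions that must be moved before the
        // dependence source for the loop to become vectorizable.
        ToMove.append(Dep.ReorderInfo.begin(), Dep.ReorderInfo.end());
}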
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1623,6 +1623,7 @@
   if (MinDistanceNeeded > MaxSafeDepDistBytes) {
     LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                       << MinDistanceNeeded << " size in bytes");
+    UnsafeBackwardMinDistGTOtherBackwardDist = true;
     return Dependence::Backward;
   }
 
@@ -1658,11 +1659,283 @@
   return Dependence::BackwardVectorizable;
 }
 
+bool MemoryDepChecker::canMoveInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 2> &Insts, const ValueToValueMap &Strides) {
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): " << *SrcInst << "\n"
+                    << "  <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+  unsigned OpNums;
+
+  // Recursive exit 1: DestInst has already been checked.
+  if (std::find(Insts.begin(), Insts.end(), DestInst) != Insts.end())
+    return true;
+  else
+    Insts.push_back(DestInst);
+
+  // Recursive exit 2: DestInst is the same as SrcInst.
+  if (DestInst == SrcInst) {
+    LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): The source instruction and the "
+                      << "destination instruction are the same\n");
+    return false;
+  }
+
+  // Recursive exit 3: both access the same address.
+  // During the recursion, DestInst and SrcInst may operate on the same
+  // address.
+  if (isa<StoreInst>(DestInst) || isa<LoadInst>(DestInst)) {
+    Value *SrcPtr, *DestPtr;
+
+    if (isa<StoreInst>(SrcInst))
+      SrcPtr = cast<StoreInst>(SrcInst)->getPointerOperand();
+    else
+      SrcPtr = cast<LoadInst>(SrcInst)->getPointerOperand();
+
+    if (isa<StoreInst>(DestInst))
+      DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+    else
+      DestPtr = cast<LoadInst>(DestInst)->getPointerOperand();
+
+    if (SrcPtr == DestPtr) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Store/Load instructions access "
+                        << "the same address\n");
+      return false;
+    }
+  }
+
+  // STEP 1: Check the operands.
+  OpNums = DestInst->getNumOperands();
+  for (unsigned Idx = 0; Idx < OpNums; Idx++) {
+    Instruction *OpInst = dyn_cast<Instruction>(DestInst->getOperand(Idx));
+
+    // The operand may just be an llvm::Value, e.g.:
+    //   a constant
+    //   a function parameter
+    //   a global variable
+    if (!OpInst)
+      continue;
+
+    BasicBlock *OpBB = OpInst->getParent();
+    BasicBlock *DestBB = DestInst->getParent();
+    // This basic block may use instructions from another basic block, and
+    // that block may in turn use instructions from this one, so keep
+    // checking.
+    if (OpBB != DestBB) {
+      // The operand is defined outside the current function.
+      // FIXME: Don't know if this is necessary.
+      if (OpBB->getParent() != DestBB->getParent())
+        continue;
+
+      // FIXME: Don't know if there is any situation that can stop the
+      // checking here.
+    }
+
+    LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Operand instructions' check\n");
+    CanMove = canMoveInstrs(SrcInst, OpInst, Insts, Strides);
+    if (!CanMove)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Between instructions' check\n");
+
+  // STEP 2: Check the instructions between SrcInst and DestInst.
+  if (SrcInst->getParent() == DestInst->getParent() &&
+      SrcInst->comesBefore(DestInst))
+    return canMoveBetweenInstrs(SrcInst, DestInst, Insts, Strides);
+  else
+    return true;
+}
+
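To illustrate STEP 1 above, consider the kind of loop the new tests exercise. The snippet below is an illustrative C-level sketch (not part of the patch) that marks the def-use chain canMoveInstrs has to pull above the dependence source.

// Illustrative sketch (not part of the patch): for the backward dependence
// between the store to a[i] and the load of a[i + 1], moving the dependence
// destination above the source also requires moving everything it uses.
void example_step1(float *a, float *b, float *c, int n) {
  for (int i = 0; i < n - 1; i++) {
    a[i] = c[i] * 10.0f;     // dependence source (store to a[i])
    b[i] = a[i + 1] + 1.0f;  // dependence destination (store to b[i])
    // STEP 1 recurses through the operands of the destination store: the
    // add, the load of a[i + 1], and its address computation all have to be
    // movable above the store to a[i].
  }
}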
+bool MemoryDepChecker::canMoveBetweenInstrs(
+    Instruction *SrcInst, Instruction *DestInst,
+    SmallVector<Instruction *, 2> &Insts, const ValueToValueMap &Strides) {
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): " << *SrcInst << "\n"
+                    << "  <---> " << *DestInst << "\n");
+
+  bool CanMove = true;
+
+  Value *Ptr = nullptr;
+  if (isa<StoreInst>(DestInst))
+    Ptr = cast<StoreInst>(DestInst)->getPointerOperand();
+  else if (isa<LoadInst>(DestInst))
+    Ptr = cast<LoadInst>(DestInst)->getPointerOperand();
+
+  // DestInst is not a load/store instruction.
+  if (!Ptr)
+    return true;
+
+  int64_t Stride = getPtrStride(PSE, Ptr, InnermostLoop, Strides, true, true);
+  Value *UnderlyingObject = getUnderlyingObject(Ptr, 6);
+
+  for (BasicBlock::iterator I = --(DestInst->getIterator()),
+                            Begin = SrcInst->getIterator();
+       I != Begin; --I) {
+    // The current instruction may already have been checked as an operand.
+    if (std::find(Insts.begin(), Insts.end(), (&*I)) != Insts.end())
+      continue;
+
+    // FIXME: Don't know how to check call instructions that read/write
+    // memory.
+    if (isa<CallInst>(&*I) &&
+        ((&*I)->mayReadFromMemory() || (&*I)->mayWriteToMemory())) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Found memory access call: "
+                        << (*I) << "\n");
+      return false;
+    }
+
+    if (isa<StoreInst>(&*I) || isa<LoadInst>(&*I)) {
+      Value *PtrTemp = nullptr;
+      if (isa<StoreInst>(&*I))
+        PtrTemp = cast<StoreInst>(&*I)->getPointerOperand();
+      else if (isa<LoadInst>(&*I))
+        PtrTemp = cast<LoadInst>(&*I)->getPointerOperand();
+
+      Value *UnderlyingObjectTemp = getUnderlyingObject(PtrTemp, 6);
+
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo):\n"
+                        << "  UnderlyingObject: " << *UnderlyingObject << "\n"
+                        << "  UnderlyingObjectTemp: " << *UnderlyingObjectTemp
+                        << "\n");
+
+      // getUnderlyingObject may still return an Instruction, in which case
+      // the following comparison is false and the current instruction is
+      // assumed not to need moving. If it actually did need to be moved,
+      // that would mean there is a memory conflict between the current
+      // instruction and DestInst; the runtime memory checks will detect the
+      // conflict and fall back to the original scalar loop.
+      if (UnderlyingObjectTemp == UnderlyingObject) {
+        const SCEV *First = PSE.getSCEV(PtrTemp);
+        const SCEV *Second = PSE.getSCEV(Ptr);
+        const SCEV *Dist = PSE.getSE()->getMinusSCEV(Second, First);
+        const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+        if (!C) {
+          LLVM_DEBUG(dbgs()
+                     << "LAA(ReorderInfo): Found non-constant distance\n");
+          return false;
+        }
+
+        const APInt &Val = C->getAPInt();
+        int64_t Distance = Val.getSExtValue();
+        LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Distance between:\n"
+                          << "  " << *PtrTemp << "\n"
+                          << "  " << *Ptr << "\n"
+                          << "  is " << Distance << "\n");
+
+        // Related instructions need to be moved along with DestInst, so
+        // check whether they can be moved. E.g.:
+        //
+        //   Stride > 0
+        //     SrcInst:     c[i] = a[i];
+        //     RelatedInst: b[i + 1] = c[i];
+        //     DestInst:    a[i + 1] = b[i];
+        //
+        //   Stride < 0
+        //     SrcInst:     c[i] = a[i];
+        //     RelatedInst: b[i - 1] = c[i];
+        //     DestInst:    a[i - 1] = b[i];
+        if ((Stride > 0 && Distance <= 0) || (Stride < 0 && Distance >= 0)) {
+          LLVM_DEBUG(dbgs()
+                     << "LAA(ReorderInfo): Found a related instruction\n");
+          CanMove = canMoveInstrs(SrcInst, &*I, Insts, Strides);
+          if (!CanMove)
+            return false;
+        }
+      }
+    }
+  }
+
+  return CanMove;
+}
+
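The related-instruction rule is easiest to see on foo5 from the new test added below. The following C-level sketch is illustrative only (not part of the patch) and marks which statements have to be hoisted as a group for the backward dependence on a[] to become forward.

// Illustrative sketch (not part of the patch), modeled on foo5 in the new
// test: to move the store to a[i + 2] above the load of a[i], its operand
// load of c[i] must move too (STEP 1), and the intervening store to c[i + 2],
// which writes the same underlying object as that load, must be hoisted
// together with it (STEP 2).
void example_related(float *a, float *b, float *c, float *d, int n) {
  for (int i = 0; i < n - 2; i++) {
    b[i] = a[i] * 2.0f;      // dependence source: load of a[i]
    c[i + 2] = d[i];         // related store: same underlying object as c[i]
    a[i + 2] = c[i] + 1.0f;  // dependence destination: store to a[i + 2]
  }
}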
+bool MemoryDepChecker::isReorderingToForwardPossible(
+    unsigned AIdx, unsigned BIdx, const ValueToValueMap &Strides,
+    SmallVector<Instruction *, 2> &ReorderNeededInstructionList) {
+  Instruction *SourceInst = InstMap[AIdx];
+  Instruction *DestInst = InstMap[BIdx];
+
+  if (SourceInst->getParent() != DestInst->getParent()) {
+    LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Accesses in different basic "
+                      << "blocks are not supported for reordering.\n"
+                      << "  Src:  " << *SourceInst << "\n"
+                      << "  Dest: " << *DestInst << "\n");
+    return false;
+  }
+
+  // For the following case, if the DestInst is moved before the SourceInst,
+  // it should not prevent store-load forwarding.
+  //   SourceInst: load a[i]
+  //   DestInst:   store a[i + n]
+  if (isa<LoadInst>(SourceInst) && isa<StoreInst>(DestInst) &&
+      EnableForwardingConflictDetection) {
+    // Get the distance between the SourceInst and the DestInst.
+    Value *SourcePtr = cast<LoadInst>(SourceInst)->getPointerOperand();
+    Value *DestPtr = cast<StoreInst>(DestInst)->getPointerOperand();
+    llvm::Type *SourceTy = SourcePtr->getType()->getPointerElementType();
+    const auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+    uint64_t TypeByteSize = DL.getTypeAllocSize(SourceTy);
+    const SCEV *Source = PSE.getSCEV(SourcePtr);
+    const SCEV *Dest = PSE.getSCEV(DestPtr);
+    const SCEV *Dist = PSE.getSE()->getMinusSCEV(Dest, Source);
+    const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+
+    if (!C) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Found non-vectorizable store "
+                        << "after load dependence (non-constant distance):\n"
+                        << "  Src:  " << *SourceInst << "\n"
+                        << "  Dest: " << *DestInst << "\n");
+      return false;
+    }
+
+    const APInt &Val = C->getAPInt();
+    int64_t Distance = Val.getSExtValue();
+
+    // Do a check like MemoryDepChecker::couldPreventStoreLoadForward.
+    //
+    // The difference is that here the distance is smaller than the minimum
+    // distance that vectorization needs, so only the following check is
+    // required.
+    if (Distance % (2 * TypeByteSize)) {
+      LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): Found non-vectorizable store "
+                        << "after load dependence (may prevent forwarding):\n"
+                        << "  Src:  " << *SourceInst << "\n"
+                        << "  Dest: " << *DestInst << "\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): The original basic block:\n"
+                    << *(SourceInst->getParent()) << "\n");
+
+  SmallVector<Instruction *, 2> CheckedInsts;
+  bool BackwardCanVectorize =
+      canMoveInstrs(SourceInst, DestInst, CheckedInsts, Strides);
+
+  LLVM_DEBUG(dbgs() << "LAA(ReorderInfo): canMoveInstrs returned: "
+                    << BackwardCanVectorize << "\n");
+
+  if (BackwardCanVectorize) {
+    // CheckedInsts may contain instructions that do not need to be moved,
+    // e.g.:
+    //   - an instruction in the for.preheader block used by DestInst
+    //   - an instruction already before SourceInst used by DestInst
+    for (auto *I : CheckedInsts) {
+      if (I->getParent() == SourceInst->getParent() &&
+          SourceInst->comesBefore(I))
+        ReorderNeededInstructionList.push_back(I);
+    }
+  }
+
+  return BackwardCanVectorize;
+}
+
 bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
                                    MemAccessInfoList &CheckDeps,
                                    const ValueToValueMap &Strides) {
 
   MaxSafeDepDistBytes = -1;
+  UnsafeBackwardMinDistGTOtherBackwardDist = false;
   SmallPtrSet<MemAccessInfo, 8> Visited;
   for (MemAccessInfo CurAccess : CheckDeps) {
     if (Visited.count(CurAccess))
@@ -1712,8 +1985,19 @@
           // unsafe dependence. This puts a limit on this quadratic
           // algorithm.
           if (RecordDependences) {
-            if (Type != Dependence::NoDep)
-              Dependences.push_back(Dependence(A.second, B.second, Type));
+            SmallVector<Instruction *, 2> ReorderNeededInstructionList;
+            if (Type == Dependence::Backward &&
+                !UnsafeBackwardMinDistGTOtherBackwardDist) {
+              isReorderingToForwardPossible(A.second, B.second, Strides,
+                                            ReorderNeededInstructionList);
+            }
+
+            if (Type != Dependence::NoDep) {
+              Dependences.push_back(
+                  Dependence(A.second, B.second, Type,
+                             ReorderNeededInstructionList));
+            }
 
             if (Dependences.size() >= MaxDependences) {
               RecordDependences = false;
@@ -1735,6 +2019,25 @@
   return isSafeForVectorization();
 }
 
+bool MemoryDepChecker::canUnsafeDepsVectorize() const {
+  if (UnsafeBackwardMinDistGTOtherBackwardDist)
+    return false;
+
+  if (!getDependences())
+    return false;
+
+  for (const auto &Dep : Dependences) {
+    if (Dep.Type == Dependence::Backward && Dep.ReorderInfo.empty())
+      return false;
+    else if (Dep.Type != Dependence::Backward &&
+             Dependence::isSafeForVectorization(Dep.Type) !=
+                 VectorizationSafetyStatus::Safe)
+      return false;
+  }
+
+  return true;
+}
+
 SmallVector<Instruction *, 4>
 MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
   MemAccessInfo Access(Ptr, isWrite);
@@ -2070,6 +2373,16 @@
     return;
   }
 
+  if (!CanVecMem) {
+    // Check whether all the dependences can be made safe by reordering
+    // instructions.
+    CanVecMem = DepChecker->canUnsafeDepsVectorize();
+    LLVM_DEBUG(dbgs() << "LAA: We can "
+                      << (CanVecMem ? "" : "not ")
+                      << "vectorize the loop by reordering instructions.\n");
+  }
+
   if (CanVecMem)
     LLVM_DEBUG(
         dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
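One step that is easy to gloss over is the store-load forwarding guard in isReorderingToForwardPossible. A small worked example (illustrative, not from the patch) of the same condition: with 4-byte elements, a load-to-store distance of 4 bytes fails Distance % (2 * TypeByteSize) (4 % 8 == 4) and reordering is rejected, which is why bar3 in the new test is not vectorized, while a distance of 8 bytes passes.

// Illustrative check (not part of the patch): mirrors the guard used in
// isReorderingToForwardPossible for a load (source) followed by a store
// (destination) to the same array.
#include <cstdint>
#include <cstdio>

static bool mayPreventForwarding(int64_t DistanceBytes, uint64_t TypeByteSize) {
  // A distance that is not a multiple of 2 * TypeByteSize is treated as
  // likely to block store-load forwarding.
  return DistanceBytes % (2 * TypeByteSize) != 0;
}

int main() {
  printf("%d\n", mayPreventForwarding(4, 4)); // 1: a[i+1] vs a[i], 4-byte type
  printf("%d\n", mayPreventForwarding(8, 4)); // 0: a[i+2] vs a[i], 4-byte type
}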
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9433,6 +9433,73 @@
                                 F, &Hints, IAI);
   CM.collectValuesToIgnore();
 
+  // Get the memory access reorder information.
+  auto *LAI = LVL.getLAI();
+  const MemoryDepChecker &Checker = LAI->getDepChecker();
+  const auto *Dependences = Checker.getDependences();
+  SmallVector<std::pair<Instruction *, SmallVector<Instruction *, 2>>, 2>
+      InstPairs;
+  for (const auto &Dep : *Dependences) {
+    if (Dep.Type != MemoryDepChecker::Dependence::Backward)
+      continue;
+
+    InstPairs.push_back(std::make_pair(Dep.getSource(*LAI), Dep.ReorderInfo));
+  }
+
+  // The following sorts are important: they ensure the correctness of the
+  // move and keep the order of instructions after the move close to the
+  // order before the move.
+  for (auto &InstPair : InstPairs) {
+    SmallVectorImpl<Instruction *> &Insts = InstPair.second;
+    llvm::sort(Insts.begin(), Insts.end(),
+               [](Instruction *L, Instruction *R) { return L->comesBefore(R); });
+  }
+
+  llvm::sort(InstPairs.begin(), InstPairs.end(),
+             [](const std::pair<Instruction *, SmallVector<Instruction *, 2>> &L,
+                const std::pair<Instruction *, SmallVector<Instruction *, 2>> &R) {
+               return L.second.front()->comesBefore(R.second.front());
+             });
+
+  // After the planner has run, these reorderings will be reverted.
+  SmallPtrSet<BasicBlock *, 2> BlockPtrsSet;
+  SmallVector<SmallVector<Instruction *, 2>, 2> OriginBlocks;
+  for (auto &InstPair : InstPairs) {
+    Instruction *SourceInst = InstPair.first;
+    SmallVectorImpl<Instruction *> &Insts = InstPair.second;
+
+    // Save the original order for reverting.
+    if (!BlockPtrsSet.contains(SourceInst->getParent())) {
+      BlockPtrsSet.insert(SourceInst->getParent());
+      SmallVector<Instruction *, 2> OriginInstrs;
+      for (Instruction &I : *(SourceInst->getParent())) {
+        OriginInstrs.push_back(&I);
+      }
+      OriginBlocks.push_back(OriginInstrs);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: Move instructions before: " << *SourceInst
+                      << "\n");
+
+    for (Instruction *I : Insts) {
+      // The instruction may already be before SourceInst because of an
+      // earlier move; if so, go to the next one.
+      if (I->comesBefore(SourceInst))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "  " << *I << "\n");
+
+      I->moveBefore(SourceInst);
+    }
+  }
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
 
@@ -9443,6 +9510,19 @@
   // Plan how to best vectorize, return the best VF and its cost.
   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
 
+  // Revert to the original loop order.
+  for (auto &OB : OriginBlocks) {
+    BasicBlock *BB = OB.front()->getParent();
+
+    LLVM_DEBUG(dbgs() << "LV: The reordered basic block:\n" << *BB << "\n");
+
+    // Revert the instruction order.
+    for (auto *I : OB) {
+      I->removeFromParent();
+      BB->getInstList().push_back(I);
+    }
+  }
+
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -145,7 +145,9 @@
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
-; int unsafe_Write_Read(int *A) {
+; Following cases can be vectorized by reordering instructions.
+ +; int vectorizable_by_reorder_Write_Read(int *A) { ; int sum = 0; ; for (unsigned i = 0; i < 1024; i+=4) { ; A[i] = i; @@ -155,15 +157,15 @@ ; return sum; ; } -; CHECK: function 'unsafe_Write_Read': +; CHECK: function 'vectorizable_by_reorder_Write_Read': ; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Backward: ; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> ; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4 -define i32 @unsafe_Write_Read(i32* nocapture %A) { +define i32 @vectorizable_by_reorder_Write_Read(i32* nocapture %A) { entry: br label %for.body @@ -184,22 +186,22 @@ br i1 %cmp, label %for.body, label %for.cond.cleanup } -; void unsafe_Write_Write(int *A) { +; void vectorizable_by_reorder_Write_Write(int *A) { ; for (unsigned i = 0; i < 1024; i+=2) { ; A[i] = i; ; A[i+2] = i+1; ; } ; } -; CHECK: function 'unsafe_Write_Write': +; CHECK: function 'vectorizable_by_reorder_Write_Write': ; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Backward: ; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> ; CHECK-NEXT: store i32 %2, i32* %arrayidx3, align 4 -define void @unsafe_Write_Write(i32* nocapture %A) { +define void @vectorizable_by_reorder_Write_Write(i32* nocapture %A) { entry: br label %for.body @@ -490,7 +492,7 @@ } ; Following case checks that interleaved stores have dependences with another -; store and can not pass dependence check. +; store and can pass dependence check with reordering. ; void interleaved_stores(int *A) { ; int *B = (int *) ((char *)A + 1); @@ -505,7 +507,7 @@ ; CHECK: function 'interleaved_stores': ; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Backward: ; CHECK-NEXT: store i32 %4, i32* %arrayidx5, align 4 -> diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll --- a/llvm/test/Transforms/LoopVectorize/memdep.ll +++ b/llvm/test/Transforms/LoopVectorize/memdep.ll @@ -60,6 +60,7 @@ } ; Plausible dependence of distance 2 - can be vectorized with a width of 2. +; And can be vectorized with a width of 4 with reordering accesses. ; for (i = 0; i < 1024; ++i) ; A[i+2] = A[i] + 1; @@ -67,7 +68,7 @@ ; CHECK: <2 x i32> ; WIDTH: f3_vec_len -; WIDTH-NOT: <4 x i32> +; WIDTH: <4 x i32> define void @f3_vec_len(i32* %A) { entry: @@ -91,7 +92,7 @@ ret void } -; Plausible dependence of distance 1 - cannot be vectorized (without reordering +; Plausible dependence of distance 1 - can be vectorized (with reordering ; accesses). ; for (i = 0; i < 1024; ++i) { ; B[i] = A[i]; @@ -99,7 +100,7 @@ ; } ; CHECK-LABEL: @f5( -; CHECK-NOT: <2 x i32> +; CHECK: <2 x i32> define void @f5(i32* %A, i32* %B) { entry: @@ -244,8 +245,10 @@ ; RIGHTVF-LABEL: @pr34283 ; RIGHTVF: <4 x i64> +;Before backward dependence can be vectorized, the following check +;is 'WRONGVF-NOT: <8 x i64>'. Now it can be vectorized. 
; WRONGVF-LABLE: @pr34283 -; WRONGVF-NOT: <8 x i64> +; WRONGVF: <8 x i64> @a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16 diff --git a/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/pr47929-vectorize-backward-dependence.ll @@ -0,0 +1,495 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; The C code corresponding to the following cases are as follows +; +; #define N 1024 +; +; float a[N], b[N], c[N], d[N], e[N], f[N]; +; +; // Loops that can be vectorized +; void foo(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; a[i] = c[i] * 10; +; b[i] = a[i + 1] + 1; +; } +; } +; +; void foo1(int LEN) { +; for (int i = LEN - 2; i > -1; i--) { +; a[i + 1] = c[i] * 10; +; b[i] = a[i] + 1; +; } +; } +; +; void foo2(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; a[i] = c[i] * 10; +; b[i] = a[i + 1] + 1; +; d[i] = b[i + 1] - 2; +; } +; } +; +; void foo3(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; a[i] = b[i] * 2; +; a[i + 1] = c[i] + 1; +; } +; } +; +; void foo4(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i] * 2; +; a[i + 2] = c[i] + 1; +; } +; } +; +; void foo5(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i] * 2; +; c[i + 2] = d[i]; +; a[i + 2] = c[i] + 1; +; } +; } +; +; // Loops that cann't be vectorized +; // Backward dependence belongs to different blocks. +; void bar(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; if (i % 2) +; a[i] = c[i] * 10; +; b[i] = a[i + 1] + 1; +; } +; } +; +; // Backward dependence is true data dependence (operands check). +; void bar1(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i]; +; a[i + 2] = b[i]; +; } +; } +; +; // Backward dependence is true data dependence (related check). +; void bar2(int LEN) { +; for (int i = 0; i < LEN - 2; i++) { +; b[i] = a[i]; +; c[i + 2] = b[i]; +; a[i + 2] = c[i]; +; } +; } +; +; // Backward dependence turns to forwardbutpreventsforwarding. 
+; void bar3(int LEN) { +; for (int i = 0; i < LEN - 1; i++) { +; b[i] = a[i] * 2; +; a[i + 1] = c[i] + 1; +; } +; } +; +@a = dso_local global [1024 x float] zeroinitializer, align 4 +@b = dso_local global [1024 x float] zeroinitializer, align 4 +@c = dso_local global [1024 x float] zeroinitializer, align 4 +@d = dso_local global [1024 x float] zeroinitializer, align 4 +@e = dso_local global [1024 x float] zeroinitializer, align 4 +@f = dso_local global [1024 x float] zeroinitializer, align 4 + +define dso_local void @foo(i32 %LEN) #0 { +; CHECK-LABEL: @foo( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo1(i32 %LEN) #0 { +; CHECK-LABEL: @foo1( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: for.body +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp6 = icmp sgt i32 %LEN, 1 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %1 = sext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %1, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx, align 4 + %mul = fmul float %2, 1.000000e+01 + %3 = add nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %4, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.end + +for.end: 
; preds = %for.body, %entry + ret void +} + +define dso_local void @foo2(i32 %LEN) #0 { +; CHECK-LABEL: @foo2( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; REVERT-LABEL: @foo2( +entry: + %cmp8 = icmp sgt i32 %LEN, 1 + br i1 %cmp8, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %arrayidx10 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv.next + %3 = load float, float* %arrayidx10, align 4 + %sub11 = fadd float %3, -2.000000e+00 + %arrayidx13 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + store float %sub11, float* %arrayidx13, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define dso_local void @foo3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo3( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo4(i32 
%LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo4( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp14 = icmp sgt i32 %LEN, 2 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add, float* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @foo5(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @foo5( +; CHECK: vector.body +; CHECK: load <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fadd <4 x float> +; CHECK: store <4 x float> +; CHECK: load <4 x float> +; CHECK: fmul <4 x float> +; CHECK: store <4 x float> +entry: + %cmp21 = icmp sgt i32 %LEN, 2 + br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @d, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %3 + store float %2, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %4 = load float, float* %arrayidx8, align 4 + %add9 = fadd float %4, 1.000000e+00 + %arrayidx12 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %3 + store float %add9, float* %arrayidx12, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar(i32 %LEN) #0 { +; CHECK-LABEL: @bar( +; CHECK-NOT: vector.body +; CHECK: for.body +; CHECK: 
if.then +; CHECK: load float +; CHECK: fmul float +; CHECK: store float +; CHECK: if.end +; CHECK: load float +; CHECK: fadd float +; CHECK: store float +entry: + %cmp7 = icmp sgt i32 %LEN, 1 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %if.end + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %if.end ] + %rem10 = and i64 %indvars.iv, 1 + %tobool.not = icmp eq i64 %rem10, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 1.000000e+01 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + %2 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %2, 1.000000e+00 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %add5, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret void +} + +define dso_local void @bar1(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar1( +; CHECK-NOT: vector.body +entry: + %cmp13 = icmp sgt i32 %LEN, 2 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar2(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar2( +; CHECK-NOT: vector.body +entry: + %cmp20 = icmp sgt i32 %LEN, 2 + br i1 %cmp20, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -2 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 
%indvars.iv + store float %1, float* %arrayidx2, align 4 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %2 + store float %1, float* %arrayidx6, align 4 + %arrayidx8 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %3 = load float, float* %arrayidx8, align 4 + %arrayidx11 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %2 + store float %3, float* %arrayidx11, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @bar3(i32 %LEN) local_unnamed_addr #0 { +; CHECK-LABEL: @bar3( +; CHECK-NOT: vector.body +entry: + %cmp14 = icmp sgt i32 %LEN, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = add i32 %LEN, -1 + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %mul = fmul float %1, 2.000000e+00 + %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @b, i64 0, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [1024 x float], [1024 x float]* @c, i64 0, i64 %indvars.iv + %2 = load float, float* %arrayidx4, align 4 + %add = fadd float %2, 1.000000e+00 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [1024 x float], [1024 x float]* @a, i64 0, i64 %indvars.iv.next + store float %add, float* %arrayidx7, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}