Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -569,7 +569,7 @@
   Value *getOrCreateTripCount(Loop *NewLoop);
 
   /// Returns (and creates if needed) the trip count of the widened loop.
-  Value *getOrCreateVectorTripCount(Loop *NewLoop);
+  Value *getOrCreateVectorTripCount(Loop *NewLoop, Loop *OrigLoop  = nullptr);
 
   /// Emit a bypass check to see if the trip count would overflow, or we
   /// wouldn't have enough iterations to execute one vector loop.
@@ -3217,7 +3217,8 @@
   return TripCount;
 }
 
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L,
+                                                       Loop *OrigLoop) {
   if (VectorTripCount)
     return VectorTripCount;
 
@@ -3232,6 +3233,73 @@
   Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
 
+  assert(OrigLoop && "during first calculation of VectorTripCount, the orig "
+                     "loop should be passed in!");
+
+  // If the loop terminating condition uses the phi value at loop entry rather
+  // than the
+  // induction variable at the current iteration, and 
+  // extract the result at the last element of the vectorized phi (as done in
+  // fixFirstOrderRecurrence), we may get an incorrect result.
+  // Note that this
+  // problem only arises when both these conditions occur:
+  // 1. TC evenly divides Step, and the scalar loop is not run.
+  // 2. Some Phi value which is a relation based on the IV phi, is
+  // extracted out of the loop (this is the firstOrderRecurrenceRelation).
+  // For correctness, the vectorized part of the loop should be TC - Step times
+  // and the remaining iterations should be run in the scalar remainder loop.
+  // Example IR:
+  // loop:
+  // %var = phi i32 [ %init, %ph ], [ %sum, %loop ]
+  // %indvar = phi i32 [ 0, %ph ], [ %indvar.next, %loop ]
+  // %sum = add i32 %indvar, %a
+  // %indvar.next = add i32 %indvar, 1
+  // %c11 = icmp slt i32 %indvar, 95
+  // br i1 %c11, label %exit, label %loop
+  // exit:
+  // %var.lcssa = phi i32 [ %var, %loop ]
+  // ret i32 %var.lcssa
+  // Here although the trip count is 96, if we run the vectorized loop
+  // with Step = 4, %var will be extracted as 95 + a since we extract the
+  // last element of the vectorized %var, whereas it should be 94 +
+  // a.
+  // This is because the exit block uses the `var` value (calculated at previous
+  // iteration) and not the `sum` value
+  // calculated at current iteration of loop.
+  auto IsSecondLastIterationExtracted = [&]() {
+    auto *ExitBlock = OrigLoop->getExitBlock();
+    auto *HeaderBlock = OrigLoop->getHeader();
+    auto *Cmp = dyn_cast<CmpInst>(HeaderBlock->getTerminator()->getOperand(0));
+    if (!Cmp)
+      return false;
+    // Confirmed that we are effectively running TC - 1 number of iterations.
+    bool LoopTermConditionUsesPhi = any_of(
+        Cmp->operands(), [&](Use &U) { return Legal->isInductionVariable(U); });
+    if (!LoopTermConditionUsesPhi)
+      return false;
+    for (auto &I : *HeaderBlock) {
+      if (!isa<PHINode>(I))
+        break;
+      // We only care about phi nodes that depend on the IV (i.e. secondary
+      // induction variables).
+      if ((Legal->getPrimaryInduction()) == &(cast<PHINode>(I)))
+        continue;
+      for (auto *U : I.users()) {
+        if (auto *PhiUse = dyn_cast<Instruction>(U))
+          // At this stage, we know that the value for the secondary IV 
+          // should be the second last element in the vectorized version (since
+          // we are using the phi).
+          // However, we don't have the capability in vector.extractelement to
+          // choose between secondlast and last element. So, we conservatively
+          // assume that the secondlastiteration will be used and generate the
+          // loop accordingly.
+          if (PhiUse->getParent() == ExitBlock)
+            return true;
+      }
+    }
+    return false;
+  };
+
   // If there is a non-reversed interleaved group that may speculatively access
   // memory out-of-bounds, we need to ensure that there will be at least one
   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
@@ -3239,7 +3307,11 @@
   // does not evenly divide the trip count, no adjustment is necessary since
   // there will already be scalar iterations. Note that the minimum iterations
   // check ensures that N >= Step.
-  if (VF > 1 && Legal->requiresScalarEpilogue()) {
+  // Also, if the second last iteration val is extracted, we should follow this logic of
+  // (N - Step) iterations in vectorized loop and remaining (atmost Step)
+  // iterations in scalar loop.
+  if (VF > 1 &&
+      (Legal->requiresScalarEpilogue() || IsSecondLastIterationExtracted())) {
     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
     R = Builder.CreateSelect(IsZero, Step, R);
   }
@@ -3448,8 +3520,9 @@
   // body. In case of overflow we want to directly jump to the scalar remainder
   // loop.
   emitMinimumIterationCountCheck(Lp, ScalarPH);
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
+  // The loop step is equal to the vectorization factor (num of SIMD elements)
+  // times the unroll factor (num of SIMD instructions).
+  Value *CountRoundDown = getOrCreateVectorTripCount(Lp, OrigLoop);
   emitVectorLoopEnteredCheck(Lp, ScalarPH);
   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
@@ -3461,9 +3534,6 @@
   emitMemRuntimeChecks(Lp, ScalarPH);
 
   // Generate the induction variable.
-  // The loop step is equal to the vectorization factor (num of SIMD elements)
-  // times the unroll factor (num of SIMD instructions).
-  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
   Induction =
       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -4070,7 +4140,13 @@
   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
 
   // Extract the last vector element in the middle block. This will be the
-  // initial value for the recurrence when jumping to the scalar loop.
+  // initial value for the recurrence when jumping to the scalar loop.  Note
+  // that the last element need not always be the correct one, see
+  // IsSecondLastIterationExtracted. We fix the vector trip count so that the assumption
+  // of last element holds true. It would have been nice to fix this here (i.e.
+  // extract from the correct index), but extractElement does not allow anything
+  // other than a constant as the index.  We will not know if it is the last
+  // element or the element before the last, until runtime.
   auto *Extract = Incoming;
   if (VF > 1) {
     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Index: test/Transforms/LoopVectorize/first-order-recurrence.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -0,0 +1,100 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -S | FileCheck %s
+
+; This loop has 96 iterations, which divides evenly with the Step = 4. However, the vectorized part should only be upto 192.
+; The remaining scalar iterations will
+; be done at the remainder loop, so that we extract the correct value for
+; %val.phi = 94 + x.
+; If we vectorize the loop completely, we extract the last element of the vector val.phi (which would be incorrect: 95 + x) 
+; CHECK-LABEL: extract_second_last_iteration
+; CHECK: vector.body:
+; CHECK: %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] 
+; CHECK: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat
+; CHECK: index.next = add i32 %index, 4
+; CHECK: icmp eq i32 %index.next, 92
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <4 x i32> %[[L1]], i32 3
+; CHECK: scalar.ph:
+; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ 0, %min.iters.checked ], [ 0, %entry ]
+; CHECK: %bc.resume.val = phi i32 [ 92, %middle.block ], [ 0, %entry ], [ 0, %min.iters.checked ] 
+; CHECK: for.body:
+; CHECK: %inc.phi = phi i32 [ %bc.resume.val, %scalar.ph ], [ %inc, %for.body ]
+; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ %addx, %for.body ]
+; CHECK: %addx = add i32 %inc.phi, %x
+; CHECK: %cmp = icmp eq i32 %inc.phi, 95
+; CHECK: for.end:
+; CHECK: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract, %middle.block ]
+; CHECK: ret i32 %val.phi.lcssa
+define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+  %inc = add i32 %inc.phi, 1
+  %bc = zext i32 %inc.phi to i64
+  %addx = add i32 %inc.phi, %x
+  %cmp = icmp eq i32 %inc.phi, 95
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %val.phi
+}
+
+; same as test above but we also store into index = val.phi.
+; FIXME: We could vectorize in this case following the same logic above (92
+; iterations vectorized and remaining is scalarized),
+; but currently the legality phase of the vectorizer prevents this.
+; CHECK-LABEL: storing_upto_second_last
+; CHECK-NOT: vector.body
+define i32 @storing_upto_second_last(i32* %cval, i32 %x)  {
+entry:
+  br label %for.body
+
+for.body:
+; val.phi at end of loop is 94 + x.
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %bc = zext i32 %val.phi to i64
+  %c40 = getelementptr inbounds i32, i32 * %cval, i64 %bc
+  %addx = add nuw nsw i32 %inc.phi, %x
+  store i32 %val.phi, i32* %c40
+  %cmp = icmp eq i32 %inc.phi, 95
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 0 
+}
+
+; We are extracting the last element from the loop. Make sure that we don't
+; unnecessarily create TC - Step iterations (i.e. 92 iterations) for the loop.
+; We can completely vectorize the loop at 96 iterations and extract the correct
+; value inc.phi = 95.
+; CHECK-LABEL: extract_last_iteration
+; CHECK: vector.body:
+; CHECK: %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] 
+; CHECK: index.next = add i32 %index, 4
+; CHECK: icmp eq i32 %index.next, 96
+; CHECK: middle.block:
+; CHECK: %cmp.n = icmp eq i32 96, 96
+; CHECK: br i1 %cmp.n, label %for.end, label %scalar.ph
+; CHECK: for.end:
+; CHECK: %inc.phi.lcssa = phi i32 [ %inc.phi, %for.body ], [ 95, %middle.block ]
+; CHECK: ret i32 %inc.phi.lcssa
+define i32 @extract_last_iteration(i32* %cval, i32 %x)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+  %inc = add i32 %inc.phi, 1
+  %bc = zext i32 %inc.phi to i64
+  %addx = add i32 %inc.phi, %x
+  %cmp = icmp eq i32 %inc.phi, 95
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.phi
+}