diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -177,7 +177,10 @@
   /// previous iteration. \p SinkAfter includes pairs of instructions where the
   /// first will be rescheduled to appear after the second if/when the loop is
   /// vectorized. It may be augmented with additional pairs if needed in order
-  /// to handle Phi as a first-order recurrence.
+  /// to handle Phi as a first-order recurrence. \p SinkAfter can contain
+  /// multiple instructions to sink after the same instruction. Clients have to
+  /// ensure the relative ordering of those instructions is preserved when
+  /// moving those instructions.
   static bool
   isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
                          DenseMap<Instruction *, Instruction *> &SinkAfter,
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -701,28 +701,42 @@
   // Ensure every user of the phi node is dominated by the previous value.
   // The dominance requirement ensures the loop vectorizer will not need to
   // vectorize the initial value prior to the first iteration of the loop.
-  // TODO: Consider extending this sinking to handle memory instructions and
-  // phis with multiple users.
+  // TODO: Consider extending this sinking to handle memory instructions.
 
   // Returns true, if all users of I are dominated by DominatedBy.
-  auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
-    return all_of(I->uses(), [DT, DominatedBy](Use &U) {
-      return DT->dominates(DominatedBy, U);
-    });
+  SetVector<Instruction *> UserWorkList;
+  auto PushUsers = [&UserWorkList](Instruction *I) {
+    for (User *User : I->users())
+      UserWorkList.insert(cast<Instruction>(User));
   };
 
-  if (Phi->hasOneUse()) {
-    Instruction *I = Phi->user_back();
-
+  // We optimistically assume we can sink all users after Previous. Keep list of
+  // instructions to sink after Previous. It will be applied to SinkAfter if all
+  // users can be sunk.
+  SmallVector<Instruction *, 4> InstrsToSink;
+  // Try to sink instruction I after Previous. Queue users of I for further
+  // processin in UserWorkList.
+  auto TryToSinkUser = [&](Instruction *I) {
     // If the user of the PHI is also the incoming value, we potentially have a
     // reduction and which cannot be handled by sinking.
     if (Previous == I)
       return false;
 
+    if (DT->dominates(Previous, I)) // We already are good w/o sinking.
+      return true;
+
+    if (I->getParent() != Phi->getParent() || I->mayHaveSideEffects())
+      return false;
+
     // We cannot sink terminator instructions.
     if (I->getParent()->getTerminator() == I)
       return false;
 
+    // If we reach a PHI node that is not dominated by Previous, we reached a
+    // header PHI. No need for sinking.
+    if (I == Phi)
+      return true;
+
     // Do not try to sink an instruction multiple times (if multiple operands
     // are first order recurrences).
     // TODO: We can support this case, by sinking the instruction after the
@@ -730,19 +744,25 @@
     if (SinkAfter.find(I) != SinkAfter.end())
       return false;
 
-    if (DT->dominates(Previous, I)) // We already are good w/o sinking.
-      return true;
+    // We can sink I after Previous, if all users can also be sunk or are
+    // dominated by Previous.
+    InstrsToSink.push_back(I);
+    PushUsers(I);
+    return true;
+  };
+  PushUsers(Phi);
 
-    // We can sink any instruction without side effects, as long as all users
-    // are dominated by the instruction we are sinking after.
-    if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
-        allUsesDominatedBy(I, Previous)) {
-      SinkAfter[I] = Previous;
-      return true;
-    }
+  // Try to recursively sink instructions and their users after Previous.
+  for (unsigned I = 0; I < UserWorkList.size(); ++I) {
+    Instruction *Current = UserWorkList[I];
+    if (!TryToSinkUser(Current))
+      return false;
   }
 
-  return allUsesDominatedBy(Phi, Previous);
+  // We can sink all users of Previous. Update the mapping.
+  for (Instruction *I : InstrsToSink)
+    SinkAfter[I] = Previous;
+  return true;
 }
 
 /// This function returns the identity element (or neutral element) for
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7777,7 +7777,19 @@
   // ---------------------------------------------------------------------------
 
   // Apply Sink-After legal constraints.
-  for (auto &Entry : SinkAfter) {
+  // Multiple instructions can be marked for sinking after the same instruction.
+  // Convert map to a vector of pairs and sort them by the reverse order of the
+  // original IR. That way the instructions that dominate others are inserted
+  // after them and they will come before instructions dominated by them, due to
+  // using moveAfter.
+  SmallVector<std::pair<Instruction *, Instruction *>, 4> SinkAfterV;
+  for (auto &Entry : SinkAfter)
+    SinkAfterV.emplace_back(Entry.first, Entry.second);
+  sort(SinkAfterV, [](const std::pair<Instruction *, Instruction *> &A,
+                      const std::pair<Instruction *, Instruction *> &B) {
+    return B.first->comesBefore(A.first);
+  });
+  for (auto &Entry : SinkAfterV) {
     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
     Sink->moveAfter(Target);
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -112,21 +112,37 @@
   ret void
 }
 
-; FIXME: Currently we can only sink a single instruction. For the example below,
-;        we also have to sink users.
-define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
-; CHECK-LABEL: define void @cannot_sink_with_additional_user(
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   br label %preheader
-
-; CHECK-LABEL: preheader:                                        ; preds = %entry
-; CHECK:  br label %for
+; Sink users of %pre.phi recursively.
+define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
+; CHECK-LABEL: define void @can_sink_with_additional_user(
 
-; CHECK-LABEL: for:                                              ; preds = %for, %preheader
-; CHECK:  br i1 %exitcond, label %exit, label %for
+; CHECK-LABEL: vector.ph:
+; CHECK-NEXT:    %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
+; CHECK-NEXT:    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
+; CHECK-NEXT:    br label %vector.body
 
-; CHECK-LABEL: exit:
-; CHECK-NEXT:    ret void
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; CHECK-NEXT:    %offset.idx = add i64 1, %index
+; CHECK-NEXT:    %0 = add i64 %offset.idx, 0
+; CHECK-NEXT:    %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
+; CHECK-NEXT:    %2 = getelementptr inbounds i32, i32* %1, i32 0
+; CHECK-NEXT:    %3 = bitcast i32* %2 to <4 x i32>*
+; CHECK-NEXT:    %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
+; CHECK-NEXT:    %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    %5 = add <4 x i32> %4, %broadcast.splat
+; CHECK-NEXT:    %6 = add <4 x i32> %5, %broadcast.splat
+; CHECK-NEXT:    %7 = add <4 x i32> %5, %wide.load
+; CHECK-NEXT:    %8 = add <4 x i32> %6, %7
+; CHECK-NEXT:    %9 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
+; CHECK-NEXT:    %10 = getelementptr inbounds i32, i32* %9, i32 0
+; CHECK-NEXT:    %11 = bitcast i32* %10 to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> %8, <4 x i32>* %11, align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 4
+; CHECK-NEXT:    %12 = icmp eq i64 %index.next, 1996
+; CHECK-NEXT:    br i1 %12, label %middle.block, label %vector.body, !llvm.loop !6
 
 entry:
   br label %preheader
@@ -267,3 +283,36 @@
 bb74:                                             ; preds = %bb13
   ret void
 }
+
+; Users that are phi nodes cannot be sunk.
+define void @cannot_sink_phi(i32* %ptr) {
+; CHECK-LABEL: define void @cannot_sink_phi(
+; CHECK-NOT:   vector.body
+entry:
+  br label %loop.header
+
+loop.header:                                      ; preds = %if.end128, %for.cond108.preheader
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
+  %for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ]
+  %c.1 = icmp ult i64 %iv, 500
+  br i1 %c.1, label %if.truebb, label %if.falsebb
+
+if.truebb:                  ; preds = %for.body114
+  br label %loop.latch
+
+if.falsebb:                                       ; preds = %for.body114
+  br label %loop.latch
+
+loop.latch:                                        ; preds = %if.then122, %for.body114.if.end128_crit_edge
+  %first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ]
+  %c.2 = icmp ult i64 %iv, 800
+  %for.next = select i1 %c.2, i32 30, i32 %first_time.1
+  %ptr.idx = getelementptr i32, i32* %ptr, i64 %iv
+  store i32 %for.next, i32* %ptr.idx
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond.not, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -545,11 +545,36 @@
 ;     b[i] = ((a[i] + 2) * a[i + 1]);
 ; }
 ;
-; NO-SINK-AFTER-LABEL: no_sink_after
-; NO-SINK-AFTER-NOT:   vector.ph:
-; NO-SINK-AFTER:       }
+; SINK-AFTER-LABEL:  @sink_after_with_multiple_users
+; SINK-AFTER-LABEL:  vector.ph:
+; SINK-AFTER-NEXT:    %n.mod.vf = urem i64 %n, 4
+; SINK-AFTER-NEXT:    %n.vec = sub i64 %n, %n.mod.vf
+; SINK-AFTER-NEXT:    %vector.recur.init = insertelement <4 x i16> undef, i16 %.pre, i32 3
+; SINK-AFTER-NEXT:    br label %vector.body
+
+; SINK-AFTER-LABEL:  vector.body:
+; SINK-AFTER-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; SINK-AFTER-NEXT:    %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER-NEXT:    %1 = add i64 %index, 0
+; SINK-AFTER-NEXT:    %2 = add nuw nsw i64 %1, 1
+; SINK-AFTER-NEXT:    %3 = getelementptr inbounds i16, i16* %a, i64 %2
+; SINK-AFTER-NEXT:    %4 = getelementptr inbounds i16, i16* %3, i32 0
+; SINK-AFTER-NEXT:    %5 = bitcast i16* %4 to <4 x i16>*
+; SINK-AFTER-NEXT:    %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43
+; SINK-AFTER-NEXT:    %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    %7 = sext <4 x i16> %6 to <4 x i32>
+; SINK-AFTER-NEXT:    %8 = add nsw <4 x i32> %7, <i32 2, i32 2, i32 2, i32 2>
+; SINK-AFTER-NEXT:    %9 = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER-NEXT:    %10 = mul nsw <4 x i32> %8, %9
+; SINK-AFTER-NEXT:    %11 = getelementptr inbounds i32, i32* %b, i64 %1
+; SINK-AFTER-NEXT:    %12 = getelementptr inbounds i32, i32* %11, i32 0
+; SINK-AFTER-NEXT:    %13 = bitcast i32* %12 to <4 x i32>*
+; SINK-AFTER-NEXT:    store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT:    %index.next = add i64 %index, 4
+; SINK-AFTER-NEXT:    %14 = icmp eq i64 %index.next, %n.vec
+; SINK-AFTER-NEXT:    br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48
 ;
-define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 entry:
   %.pre = load i16, i16* %a
   br label %for.body
@@ -626,7 +651,7 @@
 ; SINK-AFTER-NEXT:    %index.next = add i32 %index, 4
 ; SINK-AFTER-NEXT:    %vec.ind.next = add <4 x i16> %vec.ind, <i16 4, i16 4, i16 4, i16 4>
 ; SINK-AFTER-NEXT:    %6 = icmp eq i32 %index.next, 40
-; SINK-AFTER-NEXT:    br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43
+; SINK-AFTER-NEXT:    br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50
 ;
 entry:
   br label %for.cond