diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -171,7 +171,10 @@
   /// previous iteration. \p SinkAfter includes pairs of instructions where the
   /// first will be rescheduled to appear after the second if/when the loop is
   /// vectorized. It may be augmented with additional pairs if needed in order
-  /// to handle Phi as a first-order recurrence.
+  /// to handle Phi as a first-order recurrence. \p SinkAfter can contain
+  /// multiple instructions to sink after the same instruction. Clients have to
+  /// ensure the relative ordering of those instructions is preserved when
+  /// moving those instructions.
   static bool
   isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
                          DenseMap<Instruction *, Instruction *> &SinkAfter,
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -744,28 +744,41 @@
   // Ensure every user of the phi node is dominated by the previous value.
   // The dominance requirement ensures the loop vectorizer will not need to
   // vectorize the initial value prior to the first iteration of the loop.
-  // TODO: Consider extending this sinking to handle memory instructions and
-  // phis with multiple users.
-
-  // Returns true, if all users of I are dominated by DominatedBy.
-  auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
-    return all_of(I->uses(), [DT, DominatedBy](Use &U) {
-      return DT->dominates(DominatedBy, U);
-    });
-  };
+  // TODO: Consider extending this sinking to handle memory instructions.
 
-  if (Phi->hasOneUse()) {
-    Instruction *I = Phi->user_back();
+  SetVector<Instruction *> UserWorkList;
+  auto PushUsers = [&UserWorkList](Instruction *I) {
+    for (User *User : I->users())
+      UserWorkList.insert(cast<Instruction>(User));
+  };
 
+  // We optimistically assume we can sink all users after Previous. Keep list of
+  // instructions to sink after Previous. It will be applied to SinkAfter if all
+  // users can be sunk.
+  SmallVector<Instruction *, 4> InstrsToSink;
+  // Try to sink instruction I after Previous. Queue users of I for further
+  // processin in UserWorkList.
+  auto TryToSinkUser = [&](Instruction *I) {
     // If the user of the PHI is also the incoming value, we potentially have a
     // reduction and which cannot be handled by sinking.
     if (Previous == I)
       return false;
 
+    if (DT->dominates(Previous, I)) // We already are good w/o sinking.
+      return true;
+
+    if (I->getParent() != Phi->getParent() || I->mayHaveSideEffects())
+      return false;
+
     // We cannot sink terminator instructions.
     if (I->getParent()->getTerminator() == I)
       return false;
 
+    // If we reach a PHI node that is not dominated by Previous, we reached a
+    // header PHI. No need for sinking.
+    if (isa<PHINode>(I))
+      return true;
+
     // Do not try to sink an instruction multiple times (if multiple operands
     // are first order recurrences).
     // TODO: We can support this case, by sinking the instruction after the
@@ -773,19 +786,26 @@
     if (SinkAfter.find(I) != SinkAfter.end())
       return false;
 
-    if (DT->dominates(Previous, I)) // We already are good w/o sinking.
-      return true;
+    // We can sink I after Previous, if all users can also be sunk or are
+    // dominated by Previous.
+    InstrsToSink.push_back(I);
+    PushUsers(I);
+    return true;
+  };
+  PushUsers(Phi);
 
-    // We can sink any instruction without side effects, as long as all users
-    // are dominated by the instruction we are sinking after.
-    if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
-        allUsesDominatedBy(I, Previous)) {
-      SinkAfter[I] = Previous;
-      return true;
-    }
+  // Try to recursively sink instructions and their users after Previous. We try
+  // to sink each instruction exactly once.
+  for (unsigned I = 0; I < UserWorkList.size(); ++I) {
+    Instruction *Current = UserWorkList[I];
+    if (!TryToSinkUser(Current))
+      return false;
   }
 
-  return allUsesDominatedBy(Phi, Previous);
+  // We can sink all users of Phi. Update the mapping.
+  for (Instruction *I : InstrsToSink)
+    SinkAfter[I] = Previous;
+  return true;
 }
 
 /// This function returns the identity element (or neutral element) for
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9116,7 +9116,19 @@
   // ---------------------------------------------------------------------------
 
   // Apply Sink-After legal constraints.
-  for (auto &Entry : SinkAfter) {
+  // Multiple instructions can be marked for sinking after the same instruction.
+  // Convert map to a vector of pairs and sort them by the reverse order of the
+  // original IR. That way the instructions that dominate others are inserted
+  // after them and they will come before instructions dominated by them, due to
+  // using moveAfter.
+  SmallVector<std::pair<Instruction *, Instruction *>, 4> SinkAfterV;
+  for (const auto &Entry : SinkAfter)
+    SinkAfterV.emplace_back(Entry.first, Entry.second);
+  sort(SinkAfterV, [](const std::pair<Instruction *, Instruction *> &A,
+                      const std::pair<Instruction *, Instruction *> &B) {
+    return B.first->comesBefore(A.first);
+  });
+  for (auto &Entry : SinkAfterV) {
     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -173,36 +173,37 @@
   ret void
 }
 
-; FIXME: Currently we can only sink a single instruction. For the example below,
-;        we also have to sink users.
-define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
-; CHECK-LABEL: @cannot_sink_with_additional_user(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[PREHEADER:%.*]]
-; CHECK:       preheader:
-; CHECK-NEXT:    [[IDX_PHI_TRANS:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[IDX_PHI_TRANS]], align 4
-; CHECK-NEXT:    br label [[FOR:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[PRE_PHI:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[PRE_PHI]], [[X:%.*]]
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[X]]
-; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[IV]]
-; CHECK-NEXT:    [[PRE_NEXT]] = load i32, i32* [[IDX_1]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_1]], [[PRE_NEXT]]
-; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_2]], [[ADD_3]]
-; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 [[IV]]
-; CHECK-NEXT:    store i32 [[ADD_4]], i32* [[IDX_2]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
-
-
-
+; Sink users of %pre.phi recursively.
+define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
+; CHECK-LABEL: define void @can_sink_with_additional_user(
+
+; CHECK-LABEL: vector.ph:
+; CHECK-NEXT:    %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+; CHECK-NEXT:    %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    %vector.recur.init = insertelement <4 x i32> poison, i32 %.pre, i32 3
+; CHECK-NEXT:    br label %vector.body
+
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; CHECK-NEXT:    %offset.idx = add i64 1, %index
+; CHECK-NEXT:    %0 = add i64 %offset.idx, 0
+; CHECK-NEXT:    %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
+; CHECK-NEXT:    %2 = getelementptr inbounds i32, i32* %1, i32 0
+; CHECK-NEXT:    %3 = bitcast i32* %2 to <4 x i32>*
+; CHECK-NEXT:    %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
+; CHECK-NEXT:    %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    %5 = add <4 x i32> %4, %broadcast.splat
+; CHECK-NEXT:    %6 = add <4 x i32> %5, %broadcast.splat
+; CHECK-NEXT:    %7 = add <4 x i32> %5, %wide.load
+; CHECK-NEXT:    %8 = add <4 x i32> %6, %7
+; CHECK-NEXT:    %9 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
+; CHECK-NEXT:    %10 = getelementptr inbounds i32, i32* %9, i32 0
+; CHECK-NEXT:    %11 = bitcast i32* %10 to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> %8, <4 x i32>* %11, align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 4
+; CHECK-NEXT:    %12 = icmp eq i64 %index.next, 1996
+; CHECK-NEXT:    br i1 %12, label %middle.block, label %vector.body, !llvm.loop !6
 
 entry:
   br label %preheader
@@ -378,48 +379,26 @@
   ret void
 }
 
-; Users that are phi nodes cannot be sunk.
+; Some users that are phi nodes cannot be sunk.
 define void @cannot_sink_phi(i32* %ptr) {
-; CHECK-LABEL: @cannot_sink_phi(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
-; CHECK:       loop.header:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[C_1:%.*]] = icmp ult i64 [[IV]], 500
-; CHECK-NEXT:    br i1 [[C_1]], label [[IF_TRUEBB:%.*]], label [[IF_FALSEBB:%.*]]
-; CHECK:       if.truebb:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       if.falsebb:
-; CHECK-NEXT:    br label [[LOOP_LATCH]]
-; CHECK:       loop.latch:
-; CHECK-NEXT:    [[FIRST_TIME_1:%.*]] = phi i32 [ 20, [[IF_TRUEBB]] ], [ [[FOR]], [[IF_FALSEBB]] ]
-; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i64 [[IV]], 800
-; CHECK-NEXT:    [[FOR_NEXT]] = select i1 [[C_2]], i32 30, i32 [[FIRST_TIME_1]]
-; CHECK-NEXT:    [[PTR_IDX:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[FOR_NEXT]], i32* [[PTR_IDX]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
+; CHECK-LABEL: define void @cannot_sink_phi(
+; CHECK-NOT:   vector.body
 entry:
   br label %loop.header
 
-loop.header:                                      ; preds = %if.end128, %for.cond108.preheader
+loop.header:
   %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
   %for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ]
   %c.1 = icmp ult i64 %iv, 500
   br i1 %c.1, label %if.truebb, label %if.falsebb
 
-if.truebb:                  ; preds = %for.body114
+if.truebb:
   br label %loop.latch
 
-if.falsebb:                                       ; preds = %for.body114
+if.falsebb:
   br label %loop.latch
 
-loop.latch:                                        ; preds = %if.then122, %for.body114.if.end128_crit_edge
+loop.latch:
   %first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ]
   %c.2 = icmp ult i64 %iv, 800
   %for.next = select i1 %c.2, i32 30, i32 %first_time.1
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -545,11 +545,36 @@
 ;     b[i] = ((a[i] + 2) * a[i + 1]);
 ; }
 ;
-; NO-SINK-AFTER-LABEL: no_sink_after
-; NO-SINK-AFTER-NOT:   vector.ph:
-; NO-SINK-AFTER:       }
+; SINK-AFTER-LABEL:  @sink_after_with_multiple_users
+; SINK-AFTER-LABEL:  vector.ph:
+; SINK-AFTER-NEXT:    %n.mod.vf = urem i64 %n, 4
+; SINK-AFTER-NEXT:    %n.vec = sub i64 %n, %n.mod.vf
+; SINK-AFTER-NEXT:    %vector.recur.init = insertelement <4 x i16> poison, i16 %.pre, i32 3
+; SINK-AFTER-NEXT:    br label %vector.body
+
+; SINK-AFTER-LABEL:  vector.body:
+; SINK-AFTER-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; SINK-AFTER-NEXT:    %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER-NEXT:    %1 = add i64 %index, 0
+; SINK-AFTER-NEXT:    %2 = add nuw nsw i64 %1, 1
+; SINK-AFTER-NEXT:    %3 = getelementptr inbounds i16, i16* %a, i64 %2
+; SINK-AFTER-NEXT:    %4 = getelementptr inbounds i16, i16* %3, i32 0
+; SINK-AFTER-NEXT:    %5 = bitcast i16* %4 to <4 x i16>*
+; SINK-AFTER-NEXT:    %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43
+; SINK-AFTER-NEXT:    %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    %7 = sext <4 x i16> %6 to <4 x i32>
+; SINK-AFTER-NEXT:    %8 = add nsw <4 x i32> %7, <i32 2, i32 2, i32 2, i32 2>
+; SINK-AFTER-NEXT:    %9 = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER-NEXT:    %10 = mul nsw <4 x i32> %8, %9
+; SINK-AFTER-NEXT:    %11 = getelementptr inbounds i32, i32* %b, i64 %1
+; SINK-AFTER-NEXT:    %12 = getelementptr inbounds i32, i32* %11, i32 0
+; SINK-AFTER-NEXT:    %13 = bitcast i32* %12 to <4 x i32>*
+; SINK-AFTER-NEXT:    store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT:    %index.next = add i64 %index, 4
+; SINK-AFTER-NEXT:    %14 = icmp eq i64 %index.next, %n.vec
+; SINK-AFTER-NEXT:    br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48
 ;
-define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 entry:
   %.pre = load i16, i16* %a
   br label %for.body
@@ -626,7 +651,7 @@
 ; SINK-AFTER-NEXT:    %index.next = add i32 %index, 4
 ; SINK-AFTER-NEXT:    %vec.ind.next = add <4 x i16> %vec.ind, <i16 4, i16 4, i16 4, i16 4>
 ; SINK-AFTER-NEXT:    %6 = icmp eq i32 %index.next, 40
-; SINK-AFTER-NEXT:    br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43
+; SINK-AFTER-NEXT:    br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50
 ;
 entry:
   br label %for.cond
@@ -708,7 +733,7 @@
 ; CHECK-NEXT:    [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP46:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
@@ -719,7 +744,7 @@
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[TMP]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP48:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP48:!llvm.loop !.*]]
 ;
 bb:
   br label %bb2
@@ -835,7 +860,7 @@
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP49:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]])
@@ -846,7 +871,7 @@
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[TMP]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP50:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP50:!llvm.loop !.*]]
 ;
 bb:
   br label %bb2