diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -177,7 +177,10 @@ /// previous iteration. \p SinkAfter includes pairs of instructions where the /// first will be rescheduled to appear after the second if/when the loop is /// vectorized. It may be augmented with additional pairs if needed in order - /// to handle Phi as a first-order recurrence. + /// to handle Phi as a first-order recurrence. \p SinkAfter can contain + /// multiple instructions to sink after the same instruction. Clients have to + /// ensure the relative ordering of those instructions is preserved when + /// moving those instructions. static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, DenseMap &SinkAfter, diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -703,24 +703,39 @@ // phis with multiple users. // Returns true, if all users of I are dominated by DominatedBy. - auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) { - return all_of(I->uses(), [DT, DominatedBy](Use &U) { - return DT->dominates(DominatedBy, U); - }); + SetVector UserWorkList; + auto PushUsers = [&UserWorkList](Instruction *I) { + for (User *User : I->users()) + UserWorkList.insert(cast(User)); }; - if (Phi->hasOneUse()) { - Instruction *I = Phi->user_back(); - + // We optimistically assume we can sink all users after Previous. Keep list of + // instructions to sink after Previous. It will be applied to SinkAfter if all + // users can be sunk. + SmallVector InstrsToSink; + // Try to sink instruction I after Previous. Queue users of I for further + // processin in UserWorkList. + auto TryToSinkUser = [&](Instruction *I) { // If the user of the PHI is also the incoming value, we potentially have a // reduction and which cannot be handled by sinking. if (Previous == I) return false; + if (DT->dominates(Previous, I)) // We already are good w/o sinking. + return true; + // We cannot sink terminator instructions. if (I->getParent()->getTerminator() == I) return false; + // If we reach a PHI node that is not dominated by Previous, we reached a + // header PHI. No need for sinking. + if (isa(I)) + return true; + + if (I->getParent() != Phi->getParent() || I->mayHaveSideEffects()) + return false; + // Do not try to sink an instruction multiple times (if multiple operands // are first order recurrences). // TODO: We can support this case, by sinking the instruction after the @@ -728,19 +743,25 @@ if (SinkAfter.find(I) != SinkAfter.end()) return false; - if (DT->dominates(Previous, I)) // We already are good w/o sinking. - return true; + // We can sink I after Previous, if all users can also be sunk or are + // dominated by Previous. + InstrsToSink.push_back(I); + PushUsers(I); + return true; + }; + PushUsers(Phi); - // We can sink any instruction without side effects, as long as all users - // are dominated by the instruction we are sinking after. - if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() && - allUsesDominatedBy(I, Previous)) { - SinkAfter[I] = Previous; - return true; - } + // Try to recursively sink instructions and their users after Previous. + for (unsigned I = 0; I < UserWorkList.size(); ++I) { + Instruction *Current = UserWorkList[I]; + if (!TryToSinkUser(Current)) + return false; } - return allUsesDominatedBy(Phi, Previous); + // We can sink all users of Previous. Update the mapping. + for (Instruction *I : InstrsToSink) + SinkAfter[I] = Previous; + return true; } /// This function returns the identity element (or neutral element) for diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7436,7 +7436,19 @@ // --------------------------------------------------------------------------- // Apply Sink-After legal constraints. - for (auto &Entry : SinkAfter) { + // Multiple instructions can be marked for sinking after the same instruction. + // Convert map to a vector of pairs and sort them by the reverse order of the + // original IR. That way the instructions that dominate others are inserted + // after them and they will come before instructions dominated by them, due to + // using moveAfter. + SmallVector, 4> SinkAfterV; + for (auto &Entry : SinkAfter) + SinkAfterV.emplace_back(Entry.first, Entry.second); + sort(SinkAfterV, [](const std::pair &A, + const std::pair &B) { + return B.first->comesBefore(A.first); + }); + for (auto &Entry : SinkAfterV) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); Sink->moveAfter(Target); diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -112,21 +112,37 @@ ret void } -; FIXME: Currently we can only sink a single instruction. For the example below, -; we also have to sink users. -define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) { -; CHECK-LABEL: define void @cannot_sink_with_additional_user( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label %preheader - -; CHECK-LABEL: preheader: ; preds = %entry -; CHECK: br label %for +; Sink users of %pre.phi recursively. +define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) { +; CHECK-LABEL: define void @can_sink_with_additional_user( -; CHECK-LABEL: for: ; preds = %for, %preheader -; CHECK: br i1 %exitcond, label %exit, label %for +; CHECK-LABEL: vector.ph: +; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 +; CHECK-NEXT: br label %vector.body -; CHECK-LABEL: exit: -; CHECK-NEXT: ret void +; CHECK-LABEL: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] +; CHECK-NEXT: %offset.idx = add i64 1, %index +; CHECK-NEXT: %0 = add i64 %offset.idx, 0 +; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0 +; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0 +; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>* +; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4 +; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> +; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat +; CHECK-NEXT: %6 = add <4 x i32> %5, %broadcast.splat +; CHECK-NEXT: %7 = add <4 x i32> %5, %wide.load +; CHECK-NEXT: %8 = add <4 x i32> %6, %7 +; CHECK-NEXT: %9 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0 +; CHECK-NEXT: %10 = getelementptr inbounds i32, i32* %9, i32 0 +; CHECK-NEXT: %11 = bitcast i32* %10 to <4 x i32>* +; CHECK-NEXT: store <4 x i32> %8, <4 x i32>* %11, align 4 +; CHECK-NEXT: %index.next = add i64 %index, 4 +; CHECK-NEXT: %12 = icmp eq i64 %index.next, 1996 +; CHECK-NEXT: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !6 entry: br label %preheader diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -545,11 +545,36 @@ ; b[i] = ((a[i] + 2) * a[i + 1]); ; } ; -; NO-SINK-AFTER-LABEL: no_sink_after -; NO-SINK-AFTER-NOT: vector.ph: -; NO-SINK-AFTER: } +; SINK-AFTER-LABEL: @sink_after_with_multiple_users +; SINK-AFTER-LABEL: vector.ph: +; SINK-AFTER-NEXT: %n.mod.vf = urem i64 %n, 4 +; SINK-AFTER-NEXT: %n.vec = sub i64 %n, %n.mod.vf +; SINK-AFTER-NEXT: %vector.recur.init = insertelement <4 x i16> undef, i16 %.pre, i32 3 +; SINK-AFTER-NEXT: br label %vector.body + +; SINK-AFTER-LABEL: vector.body: +; SINK-AFTER-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; SINK-AFTER-NEXT: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] +; SINK-AFTER-NEXT: %1 = add i64 %index, 0 +; SINK-AFTER-NEXT: %2 = add nuw nsw i64 %1, 1 +; SINK-AFTER-NEXT: %3 = getelementptr inbounds i16, i16* %a, i64 %2 +; SINK-AFTER-NEXT: %4 = getelementptr inbounds i16, i16* %3, i32 0 +; SINK-AFTER-NEXT: %5 = bitcast i16* %4 to <4 x i16>* +; SINK-AFTER-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43 +; SINK-AFTER-NEXT: %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> +; SINK-AFTER-NEXT: %7 = sext <4 x i16> %6 to <4 x i32> +; SINK-AFTER-NEXT: %8 = add nsw <4 x i32> %7, +; SINK-AFTER-NEXT: %9 = sext <4 x i16> %wide.load to <4 x i32> +; SINK-AFTER-NEXT: %10 = mul nsw <4 x i32> %8, %9 +; SINK-AFTER-NEXT: %11 = getelementptr inbounds i32, i32* %b, i64 %1 +; SINK-AFTER-NEXT: %12 = getelementptr inbounds i32, i32* %11, i32 0 +; SINK-AFTER-NEXT: %13 = bitcast i32* %12 to <4 x i32>* +; SINK-AFTER-NEXT: store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43 +; SINK-AFTER-NEXT: %index.next = add i64 %index, 4 +; SINK-AFTER-NEXT: %14 = icmp eq i64 %index.next, %n.vec +; SINK-AFTER-NEXT: br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48 ; -define void @no_sink_after(i16* %a, i32* %b, i64 %n) { +define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) { entry: %.pre = load i16, i16* %a br label %for.body @@ -626,7 +651,7 @@ ; SINK-AFTER-NEXT: %index.next = add i32 %index, 4 ; SINK-AFTER-NEXT: %vec.ind.next = add <4 x i16> %vec.ind, ; SINK-AFTER-NEXT: %6 = icmp eq i32 %index.next, 40 -; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43 +; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50 ; entry: br label %for.cond