diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -177,7 +177,10 @@ /// previous iteration. \p SinkAfter includes pairs of instructions where the /// first will be rescheduled to appear after the second if/when the loop is /// vectorized. It may be augmented with additional pairs if needed in order - /// to handle Phi as a first-order recurrence. + /// to handle Phi as a first-order recurrence. \p SinkAfter can contain + /// multiple instructions to sink after the same instruction. Clients have to + /// ensure the relative ordering of those instructions is preserved when + /// moving those instructions. static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, DenseMap &SinkAfter, diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -701,28 +701,42 @@ // Ensure every user of the phi node is dominated by the previous value. // The dominance requirement ensures the loop vectorizer will not need to // vectorize the initial value prior to the first iteration of the loop. - // TODO: Consider extending this sinking to handle memory instructions and - // phis with multiple users. + // TODO: Consider extending this sinking to handle memory instructions. // Returns true, if all users of I are dominated by DominatedBy. - auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) { - return all_of(I->uses(), [DT, DominatedBy](Use &U) { - return DT->dominates(DominatedBy, U); - }); + SetVector UserWorkList; + auto PushUsers = [&UserWorkList](Instruction *I) { + for (User *User : I->users()) + UserWorkList.insert(cast(User)); }; - if (Phi->hasOneUse()) { - Instruction *I = Phi->user_back(); - + // We optimistically assume we can sink all users after Previous. Keep list of + // instructions to sink after Previous. It will be applied to SinkAfter if all + // users can be sunk. + SmallVector InstrsToSink; + // Try to sink instruction I after Previous. Queue users of I for further + // processin in UserWorkList. + auto TryToSinkUser = [&](Instruction *I) { // If the user of the PHI is also the incoming value, we potentially have a // reduction and which cannot be handled by sinking. if (Previous == I) return false; + if (DT->dominates(Previous, I)) // We already are good w/o sinking. + return true; + + if (I->getParent() != Phi->getParent() || I->mayHaveSideEffects()) + return false; + // We cannot sink terminator instructions. if (I->getParent()->getTerminator() == I) return false; + // If we reach a PHI node that is not dominated by Previous, we reached a + // header PHI. No need for sinking. + if (I == Phi) + return true; + // Do not try to sink an instruction multiple times (if multiple operands // are first order recurrences). // TODO: We can support this case, by sinking the instruction after the @@ -730,19 +744,25 @@ if (SinkAfter.find(I) != SinkAfter.end()) return false; - if (DT->dominates(Previous, I)) // We already are good w/o sinking. - return true; + // We can sink I after Previous, if all users can also be sunk or are + // dominated by Previous. + InstrsToSink.push_back(I); + PushUsers(I); + return true; + }; + PushUsers(Phi); - // We can sink any instruction without side effects, as long as all users - // are dominated by the instruction we are sinking after. - if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() && - allUsesDominatedBy(I, Previous)) { - SinkAfter[I] = Previous; - return true; - } + // Try to recursively sink instructions and their users after Previous. + for (unsigned I = 0; I < UserWorkList.size(); ++I) { + Instruction *Current = UserWorkList[I]; + if (!TryToSinkUser(Current)) + return false; } - return allUsesDominatedBy(Phi, Previous); + // We can sink all users of Previous. Update the mapping. + for (Instruction *I : InstrsToSink) + SinkAfter[I] = Previous; + return true; } /// This function returns the identity element (or neutral element) for diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7777,7 +7777,19 @@ // --------------------------------------------------------------------------- // Apply Sink-After legal constraints. - for (auto &Entry : SinkAfter) { + // Multiple instructions can be marked for sinking after the same instruction. + // Convert map to a vector of pairs and sort them by the reverse order of the + // original IR. That way the instructions that dominate others are inserted + // after them and they will come before instructions dominated by them, due to + // using moveAfter. + SmallVector, 4> SinkAfterV; + for (auto &Entry : SinkAfter) + SinkAfterV.emplace_back(Entry.first, Entry.second); + sort(SinkAfterV, [](const std::pair &A, + const std::pair &B) { + return B.first->comesBefore(A.first); + }); + for (auto &Entry : SinkAfterV) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); Sink->moveAfter(Target); diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -112,21 +112,37 @@ ret void } -; FIXME: Currently we can only sink a single instruction. For the example below, -; we also have to sink users. -define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) { -; CHECK-LABEL: define void @cannot_sink_with_additional_user( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label %preheader - -; CHECK-LABEL: preheader: ; preds = %entry -; CHECK: br label %for +; Sink users of %pre.phi recursively. +define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) { +; CHECK-LABEL: define void @can_sink_with_additional_user( -; CHECK-LABEL: for: ; preds = %for, %preheader -; CHECK: br i1 %exitcond, label %exit, label %for +; CHECK-LABEL: vector.ph: +; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 +; CHECK-NEXT: br label %vector.body -; CHECK-LABEL: exit: -; CHECK-NEXT: ret void +; CHECK-LABEL: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] +; CHECK-NEXT: %offset.idx = add i64 1, %index +; CHECK-NEXT: %0 = add i64 %offset.idx, 0 +; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0 +; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0 +; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>* +; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4 +; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> +; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat +; CHECK-NEXT: %6 = add <4 x i32> %5, %broadcast.splat +; CHECK-NEXT: %7 = add <4 x i32> %5, %wide.load +; CHECK-NEXT: %8 = add <4 x i32> %6, %7 +; CHECK-NEXT: %9 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0 +; CHECK-NEXT: %10 = getelementptr inbounds i32, i32* %9, i32 0 +; CHECK-NEXT: %11 = bitcast i32* %10 to <4 x i32>* +; CHECK-NEXT: store <4 x i32> %8, <4 x i32>* %11, align 4 +; CHECK-NEXT: %index.next = add i64 %index, 4 +; CHECK-NEXT: %12 = icmp eq i64 %index.next, 1996 +; CHECK-NEXT: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !6 entry: br label %preheader @@ -267,3 +283,36 @@ bb74: ; preds = %bb13 ret void } + +; Users that are phi nodes cannot be sunk. +define void @cannot_sink_phi(i32* %ptr) { +; CHECK-LABEL: define void @cannot_sink_phi( +; CHECK-NOT: vector.body +entry: + br label %loop.header + +loop.header: ; preds = %if.end128, %for.cond108.preheader + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ] + %for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ] + %c.1 = icmp ult i64 %iv, 500 + br i1 %c.1, label %if.truebb, label %if.falsebb + +if.truebb: ; preds = %for.body114 + br label %loop.latch + +if.falsebb: ; preds = %for.body114 + br label %loop.latch + +loop.latch: ; preds = %if.then122, %for.body114.if.end128_crit_edge + %first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ] + %c.2 = icmp ult i64 %iv, 800 + %for.next = select i1 %c.2, i32 30, i32 %first_time.1 + %ptr.idx = getelementptr i32, i32* %ptr, i64 %iv + store i32 %for.next, i32* %ptr.idx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1000 + br i1 %exitcond.not, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -545,11 +545,36 @@ ; b[i] = ((a[i] + 2) * a[i + 1]); ; } ; -; NO-SINK-AFTER-LABEL: no_sink_after -; NO-SINK-AFTER-NOT: vector.ph: -; NO-SINK-AFTER: } +; SINK-AFTER-LABEL: @sink_after_with_multiple_users +; SINK-AFTER-LABEL: vector.ph: +; SINK-AFTER-NEXT: %n.mod.vf = urem i64 %n, 4 +; SINK-AFTER-NEXT: %n.vec = sub i64 %n, %n.mod.vf +; SINK-AFTER-NEXT: %vector.recur.init = insertelement <4 x i16> undef, i16 %.pre, i32 3 +; SINK-AFTER-NEXT: br label %vector.body + +; SINK-AFTER-LABEL: vector.body: +; SINK-AFTER-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; SINK-AFTER-NEXT: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] +; SINK-AFTER-NEXT: %1 = add i64 %index, 0 +; SINK-AFTER-NEXT: %2 = add nuw nsw i64 %1, 1 +; SINK-AFTER-NEXT: %3 = getelementptr inbounds i16, i16* %a, i64 %2 +; SINK-AFTER-NEXT: %4 = getelementptr inbounds i16, i16* %3, i32 0 +; SINK-AFTER-NEXT: %5 = bitcast i16* %4 to <4 x i16>* +; SINK-AFTER-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43 +; SINK-AFTER-NEXT: %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> +; SINK-AFTER-NEXT: %7 = sext <4 x i16> %6 to <4 x i32> +; SINK-AFTER-NEXT: %8 = add nsw <4 x i32> %7, +; SINK-AFTER-NEXT: %9 = sext <4 x i16> %wide.load to <4 x i32> +; SINK-AFTER-NEXT: %10 = mul nsw <4 x i32> %8, %9 +; SINK-AFTER-NEXT: %11 = getelementptr inbounds i32, i32* %b, i64 %1 +; SINK-AFTER-NEXT: %12 = getelementptr inbounds i32, i32* %11, i32 0 +; SINK-AFTER-NEXT: %13 = bitcast i32* %12 to <4 x i32>* +; SINK-AFTER-NEXT: store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43 +; SINK-AFTER-NEXT: %index.next = add i64 %index, 4 +; SINK-AFTER-NEXT: %14 = icmp eq i64 %index.next, %n.vec +; SINK-AFTER-NEXT: br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48 ; -define void @no_sink_after(i16* %a, i32* %b, i64 %n) { +define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) { entry: %.pre = load i16, i16* %a br label %for.body @@ -626,7 +651,7 @@ ; SINK-AFTER-NEXT: %index.next = add i32 %index, 4 ; SINK-AFTER-NEXT: %vec.ind.next = add <4 x i16> %vec.ind, ; SINK-AFTER-NEXT: %6 = icmp eq i32 %index.next, 40 -; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43 +; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50 ; entry: br label %for.cond