diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -171,7 +171,10 @@ /// previous iteration. \p SinkAfter includes pairs of instructions where the /// first will be rescheduled to appear after the second if/when the loop is /// vectorized. It may be augmented with additional pairs if needed in order - /// to handle Phi as a first-order recurrence. + /// to handle Phi as a first-order recurrence. \p SinkAfter can contain + /// multiple instructions to sink after the same instruction. Clients have to + /// ensure the relative ordering of those instructions is preserved when + /// moving those instructions. static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, DenseMap &SinkAfter, diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -744,28 +744,41 @@ // Ensure every user of the phi node is dominated by the previous value. // The dominance requirement ensures the loop vectorizer will not need to // vectorize the initial value prior to the first iteration of the loop. - // TODO: Consider extending this sinking to handle memory instructions and - // phis with multiple users. - - // Returns true, if all users of I are dominated by DominatedBy. - auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) { - return all_of(I->uses(), [DT, DominatedBy](Use &U) { - return DT->dominates(DominatedBy, U); - }); - }; + // TODO: Consider extending this sinking to handle memory instructions. - if (Phi->hasOneUse()) { - Instruction *I = Phi->user_back(); + SetVector UserWorkList; + auto PushUsers = [&UserWorkList](Instruction *I) { + for (User *User : I->users()) + UserWorkList.insert(cast(User)); + }; + // We optimistically assume we can sink all users after Previous. Keep list of + // instructions to sink after Previous. It will be applied to SinkAfter if all + // users can be sunk. + SmallVector InstrsToSink; + // Try to sink instruction I after Previous. Queue users of I for further + // processin in UserWorkList. + auto TryToSinkUser = [&](Instruction *I) { // If the user of the PHI is also the incoming value, we potentially have a // reduction and which cannot be handled by sinking. if (Previous == I) return false; + if (DT->dominates(Previous, I)) // We already are good w/o sinking. + return true; + + if (I->getParent() != Phi->getParent() || I->mayHaveSideEffects()) + return false; + // We cannot sink terminator instructions. if (I->getParent()->getTerminator() == I) return false; + // If we reach a PHI node that is not dominated by Previous, we reached a + // header PHI. No need for sinking. + if (isa(I)) + return true; + // Do not try to sink an instruction multiple times (if multiple operands // are first order recurrences). // TODO: We can support this case, by sinking the instruction after the @@ -773,19 +786,26 @@ if (SinkAfter.find(I) != SinkAfter.end()) return false; - if (DT->dominates(Previous, I)) // We already are good w/o sinking. - return true; + // We can sink I after Previous, if all users can also be sunk or are + // dominated by Previous. + InstrsToSink.push_back(I); + PushUsers(I); + return true; + }; + PushUsers(Phi); - // We can sink any instruction without side effects, as long as all users - // are dominated by the instruction we are sinking after. - if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() && - allUsesDominatedBy(I, Previous)) { - SinkAfter[I] = Previous; - return true; - } + // Try to recursively sink instructions and their users after Previous. We try + // to sink each instruction exactly once. + for (unsigned I = 0; I < UserWorkList.size(); ++I) { + Instruction *Current = UserWorkList[I]; + if (!TryToSinkUser(Current)) + return false; } - return allUsesDominatedBy(Phi, Previous); + // We can sink all users of Phi. Update the mapping. + for (Instruction *I : InstrsToSink) + SinkAfter[I] = Previous; + return true; } /// This function returns the identity element (or neutral element) for diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9116,7 +9116,19 @@ // --------------------------------------------------------------------------- // Apply Sink-After legal constraints. - for (auto &Entry : SinkAfter) { + // Multiple instructions can be marked for sinking after the same instruction. + // Convert map to a vector of pairs and sort them by the reverse order of the + // original IR. That way the instructions that dominate others are inserted + // after them and they will come before instructions dominated by them, due to + // using moveAfter. + SmallVector, 4> SinkAfterV; + for (const auto &Entry : SinkAfter) + SinkAfterV.emplace_back(Entry.first, Entry.second); + sort(SinkAfterV, [](const std::pair &A, + const std::pair &B) { + return B.first->comesBefore(A.first); + }); + for (auto &Entry : SinkAfterV) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -173,36 +173,37 @@ ret void } -; FIXME: Currently we can only sink a single instruction. For the example below, -; we also have to sink users. -define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) { -; CHECK-LABEL: @cannot_sink_with_additional_user( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[PREHEADER:%.*]] -; CHECK: preheader: -; CHECK-NEXT: [[IDX_PHI_TRANS:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1 -; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, i32* [[IDX_PHI_TRANS]], align 4 -; CHECK-NEXT: br label [[FOR:%.*]] -; CHECK: for: -; CHECK-NEXT: [[PRE_PHI:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR]] ] -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[PRE_PHI]], [[X:%.*]] -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[X]] -; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[IV]] -; CHECK-NEXT: [[PRE_NEXT]] = load i32, i32* [[IDX_1]], align 4 -; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_1]], [[PRE_NEXT]] -; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_2]], [[ADD_3]] -; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 [[IV]] -; CHECK-NEXT: store i32 [[ADD_4]], i32* [[IDX_2]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]] -; CHECK: exit: -; CHECK-NEXT: ret void -; - - - +; Sink users of %pre.phi recursively. +define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) { +; CHECK-LABEL: define void @can_sink_with_additional_user( + +; CHECK-LABEL: vector.ph: +; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> poison, i32 %.pre, i32 3 +; CHECK-NEXT: br label %vector.body + +; CHECK-LABEL: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] +; CHECK-NEXT: %offset.idx = add i64 1, %index +; CHECK-NEXT: %0 = add i64 %offset.idx, 0 +; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0 +; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0 +; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>* +; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4 +; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> +; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat +; CHECK-NEXT: %6 = add <4 x i32> %5, %broadcast.splat +; CHECK-NEXT: %7 = add <4 x i32> %5, %wide.load +; CHECK-NEXT: %8 = add <4 x i32> %6, %7 +; CHECK-NEXT: %9 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0 +; CHECK-NEXT: %10 = getelementptr inbounds i32, i32* %9, i32 0 +; CHECK-NEXT: %11 = bitcast i32* %10 to <4 x i32>* +; CHECK-NEXT: store <4 x i32> %8, <4 x i32>* %11, align 4 +; CHECK-NEXT: %index.next = add i64 %index, 4 +; CHECK-NEXT: %12 = icmp eq i64 %index.next, 1996 +; CHECK-NEXT: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !6 entry: br label %preheader @@ -378,48 +379,26 @@ ret void } -; Users that are phi nodes cannot be sunk. +; Some users that are phi nodes cannot be sunk. define void @cannot_sink_phi(i32* %ptr) { -; CHECK-LABEL: @cannot_sink_phi( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: [[C_1:%.*]] = icmp ult i64 [[IV]], 500 -; CHECK-NEXT: br i1 [[C_1]], label [[IF_TRUEBB:%.*]], label [[IF_FALSEBB:%.*]] -; CHECK: if.truebb: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: if.falsebb: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[FIRST_TIME_1:%.*]] = phi i32 [ 20, [[IF_TRUEBB]] ], [ [[FOR]], [[IF_FALSEBB]] ] -; CHECK-NEXT: [[C_2:%.*]] = icmp ult i64 [[IV]], 800 -; CHECK-NEXT: [[FOR_NEXT]] = select i1 [[C_2]], i32 30, i32 [[FIRST_TIME_1]] -; CHECK-NEXT: [[PTR_IDX:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[IV]] -; CHECK-NEXT: store i32 [[FOR_NEXT]], i32* [[PTR_IDX]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: ret void -; +; CHECK-LABEL: define void @cannot_sink_phi( +; CHECK-NOT: vector.body entry: br label %loop.header -loop.header: ; preds = %if.end128, %for.cond108.preheader +loop.header: %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ] %for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ] %c.1 = icmp ult i64 %iv, 500 br i1 %c.1, label %if.truebb, label %if.falsebb -if.truebb: ; preds = %for.body114 +if.truebb: br label %loop.latch -if.falsebb: ; preds = %for.body114 +if.falsebb: br label %loop.latch -loop.latch: ; preds = %if.then122, %for.body114.if.end128_crit_edge +loop.latch: %first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ] %c.2 = icmp ult i64 %iv, 800 %for.next = select i1 %c.2, i32 30, i32 %first_time.1 diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -545,11 +545,36 @@ ; b[i] = ((a[i] + 2) * a[i + 1]); ; } ; -; NO-SINK-AFTER-LABEL: no_sink_after -; NO-SINK-AFTER-NOT: vector.ph: -; NO-SINK-AFTER: } +; SINK-AFTER-LABEL: @sink_after_with_multiple_users +; SINK-AFTER-LABEL: vector.ph: +; SINK-AFTER-NEXT: %n.mod.vf = urem i64 %n, 4 +; SINK-AFTER-NEXT: %n.vec = sub i64 %n, %n.mod.vf +; SINK-AFTER-NEXT: %vector.recur.init = insertelement <4 x i16> poison, i16 %.pre, i32 3 +; SINK-AFTER-NEXT: br label %vector.body + +; SINK-AFTER-LABEL: vector.body: +; SINK-AFTER-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; SINK-AFTER-NEXT: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] +; SINK-AFTER-NEXT: %1 = add i64 %index, 0 +; SINK-AFTER-NEXT: %2 = add nuw nsw i64 %1, 1 +; SINK-AFTER-NEXT: %3 = getelementptr inbounds i16, i16* %a, i64 %2 +; SINK-AFTER-NEXT: %4 = getelementptr inbounds i16, i16* %3, i32 0 +; SINK-AFTER-NEXT: %5 = bitcast i16* %4 to <4 x i16>* +; SINK-AFTER-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43 +; SINK-AFTER-NEXT: %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> +; SINK-AFTER-NEXT: %7 = sext <4 x i16> %6 to <4 x i32> +; SINK-AFTER-NEXT: %8 = add nsw <4 x i32> %7, +; SINK-AFTER-NEXT: %9 = sext <4 x i16> %wide.load to <4 x i32> +; SINK-AFTER-NEXT: %10 = mul nsw <4 x i32> %8, %9 +; SINK-AFTER-NEXT: %11 = getelementptr inbounds i32, i32* %b, i64 %1 +; SINK-AFTER-NEXT: %12 = getelementptr inbounds i32, i32* %11, i32 0 +; SINK-AFTER-NEXT: %13 = bitcast i32* %12 to <4 x i32>* +; SINK-AFTER-NEXT: store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43 +; SINK-AFTER-NEXT: %index.next = add i64 %index, 4 +; SINK-AFTER-NEXT: %14 = icmp eq i64 %index.next, %n.vec +; SINK-AFTER-NEXT: br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48 ; -define void @no_sink_after(i16* %a, i32* %b, i64 %n) { +define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) { entry: %.pre = load i16, i16* %a br label %for.body @@ -626,7 +651,7 @@ ; SINK-AFTER-NEXT: %index.next = add i32 %index, 4 ; SINK-AFTER-NEXT: %vec.ind.next = add <4 x i16> %vec.ind, ; SINK-AFTER-NEXT: %6 = icmp eq i32 %index.next, 40 -; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43 +; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50 ; entry: br label %for.cond @@ -708,7 +733,7 @@ ; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP46:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]] ; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) @@ -719,7 +744,7 @@ ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[TMP]] ; CHECK: bb2: -; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP48:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP48:!llvm.loop !.*]] ; bb: br label %bb2 @@ -835,7 +860,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], ; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP49:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]] ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]]) @@ -846,7 +871,7 @@ ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[TMP]] ; CHECK: bb2: -; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP50:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP50:!llvm.loop !.*]] ; bb: br label %bb2