diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -14,6 +14,7 @@
 #define LLVM_ANALYSIS_IVDESCRIPTORS_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -174,7 +175,7 @@
   /// to handle Phi as a first-order recurrence.
   static bool
   isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
-                         DenseMap<Instruction *, Instruction *> &SinkAfter,
+                         MapVector<Instruction *, Instruction *> &SinkAfter,
                          DominatorTree *DT);
 
   RecurKind getRecurrenceKind() const { return Kind; }
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -295,7 +295,7 @@
   RecurrenceSet &getFirstOrderRecurrences() { return FirstOrderRecurrences; }
 
   /// Return the set of instructions to sink to handle first-order recurrences.
-  DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+  MapVector<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
 
   /// Returns the widest induction type.
   Type *getWidestInductionType() { return WidestIndTy; }
@@ -518,7 +518,7 @@
 
   /// Holds instructions that need to sink past other instructions to handle
   /// first-order recurrences.
-  DenseMap<Instruction *, Instruction *> SinkAfter;
+  MapVector<Instruction *, Instruction *> SinkAfter;
 
   /// Holds the widest induction type encountered.
   Type *WidestIndTy = nullptr;
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -34,6 +34,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
 
+#include <set>
+
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -715,7 +717,7 @@
 bool RecurrenceDescriptor::isFirstOrderRecurrence(
     PHINode *Phi, Loop *TheLoop,
-    DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
+    MapVector<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
 
   // Ensure the phi node is in the loop header and has two incoming values.
   if (Phi->getParent() != TheLoop->getHeader() ||
@@ -741,51 +743,76 @@
       SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
     return false;
 
-  // Ensure every user of the phi node is dominated by the previous value.
-  // The dominance requirement ensures the loop vectorizer will not need to
-  // vectorize the initial value prior to the first iteration of the loop.
-  // TODO: Consider extending this sinking to handle memory instructions and
-  // phis with multiple users.
-
-  // Returns true, if all users of I are dominated by DominatedBy.
-  auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
-    return all_of(I->uses(), [DT, DominatedBy](Use &U) {
-      return DT->dominates(DominatedBy, U);
-    });
+  // Ensure every user of the phi node (recursively) is dominated by the
+  // previous value. The dominance requirement ensures the loop vectorizer will
+  // not need to vectorize the initial value prior to the first iteration of
+  // the loop.
+  // TODO: Consider extending this sinking to handle memory instructions.
+
+  // We optimistically assume we can sink all users after Previous. Keep a set
+  // of instructions to sink after Previous ordered by dominance in the common
+  // basic block. It will be applied to SinkAfter if all users can be sunk.
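+  // Because the set is ordered by comesBefore, the chaining loop at the end
+  // of this function (SinkAfter[I] = Previous; Previous = I;) records the
+  // sink targets in the candidates' original program order: the first
+  // candidate is sunk directly after Previous, and each later one after the
+  // candidate sunk just before it.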
+  auto CompareByComesBefore = [](const Instruction *A, const Instruction *B) {
+    return A->comesBefore(B);
   };
+  std::set<Instruction *, decltype(CompareByComesBefore)> InstrsToSink(
+      CompareByComesBefore);
+
+  BasicBlock *PhiBB = Phi->getParent();
+  SmallVector<Instruction *, 8> WorkList;
+  auto TryToPushSinkCandidate = [&](Instruction *SinkCandidate) {
+    // Already sunk SinkCandidate.
+    if (SinkCandidate->getParent() == PhiBB &&
+        InstrsToSink.find(SinkCandidate) != InstrsToSink.end())
+      return true;
 
-  if (Phi->hasOneUse()) {
-    Instruction *I = Phi->user_back();
-
-    // If the user of the PHI is also the incoming value, we potentially have a
-    // reduction and which cannot be handled by sinking.
-    if (Previous == I)
+    // Cyclic dependence.
+    if (Previous == SinkCandidate)
       return false;
 
-    // We cannot sink terminator instructions.
-    if (I->getParent()->getTerminator() == I)
+    if (DT->dominates(Previous,
+                      SinkCandidate)) // We already are good w/o sinking.
+      return true;
+
+    if (SinkCandidate->getParent() != PhiBB ||
+        SinkCandidate->mayHaveSideEffects() ||
+        SinkCandidate->mayReadFromMemory() || SinkCandidate->isTerminator())
       return false;
 
     // Do not try to sink an instruction multiple times (if multiple operands
     // are first order recurrences).
     // TODO: We can support this case, by sinking the instruction after the
     // 'deepest' previous instruction.
-    if (SinkAfter.find(I) != SinkAfter.end())
+    if (SinkAfter.find(SinkCandidate) != SinkAfter.end())
      return false;
 
-    if (DT->dominates(Previous, I)) // We already are good w/o sinking.
+    // If we reach a PHI node that is not dominated by Previous, we reached a
+    // header PHI. No need for sinking.
+    if (isa<PHINode>(SinkCandidate))
       return true;
 
-    // We can sink any instruction without side effects, as long as all users
-    // are dominated by the instruction we are sinking after.
-    if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
-        allUsesDominatedBy(I, Previous)) {
-      SinkAfter[I] = Previous;
-      return true;
+    // Sink SinkCandidate tentatively and check its users.
+    InstrsToSink.insert(SinkCandidate);
+    WorkList.push_back(SinkCandidate);
+    return true;
+  };
+
+  WorkList.push_back(Phi);
+  // Try to recursively sink instructions and their users after Previous.
+  while (!WorkList.empty()) {
+    Instruction *Current = WorkList.pop_back_val();
+    for (User *User : Current->users()) {
+      if (!TryToPushSinkCandidate(cast<Instruction>(User)))
+        return false;
     }
   }
 
-  return allUsesDominatedBy(Phi, Previous);
+  // We can sink all users of Phi. Update the mapping.
+  for (Instruction *I : InstrsToSink) {
+    SinkAfter[I] = Previous;
+    Previous = I;
+  }
+  return true;
 }
 
 /// This function returns the identity element (or neutral element) for
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -344,7 +344,7 @@
   /// Legal. This method is only used for the legacy inner loop vectorizer.
   VPlanPtr buildVPlanWithVPRecipes(
       VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
-      const DenseMap<Instruction *, Instruction *> &SinkAfter);
+      const MapVector<Instruction *, Instruction *> &SinkAfter);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8994,7 +8994,7 @@
   auto &ConditionalAssumes = Legal->getConditionalAssumes();
   DeadInstructions.insert(ConditionalAssumes.begin(),
                           ConditionalAssumes.end());
 
-  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
   // Dead instructions do not need sinking. Remove them from SinkAfter.
   for (Instruction *I : DeadInstructions)
     SinkAfter.erase(I);
@@ -9010,7 +9010,7 @@
 
 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
-    const DenseMap<Instruction *, Instruction *> &SinkAfter) {
+    const MapVector<Instruction *, Instruction *> &SinkAfter) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -173,20 +173,54 @@
   ret void
 }
 
-; FIXME: Currently we can only sink a single instruction. For the example below,
-; we also have to sink users.
-define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
-; CHECK-LABEL: @cannot_sink_with_additional_user(
+; Sink users of %pre.phi recursively.
+define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
+; CHECK-LABEL: @can_sink_with_additional_user(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[PREHEADER:%.*]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    [[IDX_PHI_TRANS:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
 ; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[IDX_PHI_TRANS]], align 4
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[DOTPRE]], i32 3
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1996
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1999, 1996
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
-; CHECK-NEXT:    [[PRE_PHI:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[PRE_PHI]], [[X:%.*]]
+; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[SCALAR_RECUR]], [[X]]
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[X]]
 ; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[IV]]
 ; CHECK-NEXT:    [[PRE_NEXT]] = load i32, i32* [[IDX_1]], align 4
@@ -196,14 +230,13 @@
 ; CHECK-NEXT:    store i32 [[ADD_4]], i32* [[IDX_2]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
-
 entry:
   br label %preheader
@@ -378,7 +411,9 @@
   ret void
 }
 
-; Users that are phi nodes cannot be sunk.
+; The first reason `%first_time.1` cannot be sunk is that it is defined outside
+; the header and is not dominated by Previous. That it also feeds Previous is
+; a second reason preventing sinking.
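+; (Concretely, TryToPushSinkCandidate rejects %first_time.1 at the
+; SinkCandidate->getParent() != PhiBB check: it is defined in %loop.latch,
+; not in the loop header.)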
 define void @cannot_sink_phi(i32* %ptr) {
 ; CHECK-LABEL: @cannot_sink_phi(
 ; CHECK-NEXT:  entry:
@@ -407,19 +442,19 @@
 entry:
   br label %loop.header
 
-loop.header:                                      ; preds = %if.end128, %for.cond108.preheader
+loop.header:
   %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
   %for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ]
   %c.1 = icmp ult i64 %iv, 500
   br i1 %c.1, label %if.truebb, label %if.falsebb
 
-if.truebb:                                        ; preds = %for.body114
+if.truebb:
   br label %loop.latch
 
-if.falsebb:                                       ; preds = %for.body114
+if.falsebb:
   br label %loop.latch
 
-loop.latch:                                       ; preds = %if.then122, %for.body114.if.end128_crit_edge
+loop.latch:
   %first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ]
   %c.2 = icmp ult i64 %iv, 800
   %for.next = select i1 %c.2, i32 30, i32 %first_time.1
@@ -606,20 +641,67 @@
 define void @sink_dominance(i32* %ptr, i32 %N) {
 ; CHECK-LABEL: @sink_dominance(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UMAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[N:%.*]], i32 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[UMAX1]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 false, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp slt <4 x i32> [[TMP14]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP14]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_TRUNC:%.*]] = trunc i64 [[FOR]] to i32
+; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[FOR_TRUNC:%.*]] = trunc i64 [[SCALAR_RECUR]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[FOR_TRUNC]], 213
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[CMP]], i32 [[FOR_TRUNC]], i32 22
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[IV]]
 ; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[FOR_NEXT]] = zext i32 [[LV]] to i64
 ; CHECK-NEXT:    store i32 [[SELECT]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[CMP73:%.*]] = icmp ugt i32 [[N:%.*]], [[IV_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
+; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -652,22 +734,71 @@
 define void @sink_dominance_2(i32* %ptr, i32 %N) {
 ; CHECK-LABEL: @sink_dominance_2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UMAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[N:%.*]], i32 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[UMAX1]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 false, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP14]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <4 x i32> [[TMP15]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp slt <4 x i32> [[TMP14]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR_TRUNC:%.*]] = trunc i64 [[FOR]] to i32
+; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[FOR_TRUNC:%.*]] = trunc i64 [[SCALAR_RECUR]] to i32
 ; CHECK-NEXT:    [[STEP:%.*]] = add i32 [[FOR_TRUNC]], 2
 ; CHECK-NEXT:    [[STEP_2:%.*]] = mul i32 [[STEP]], 99
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[FOR_TRUNC]], 213
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[CMP]], i32 [[FOR_TRUNC]], i32 [[STEP_2]]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[IV]]
 ; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[FOR_NEXT]] = zext i32 [[LV]] to i64
 ; CHECK-NEXT:    store i32 [[SELECT]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[CMP73:%.*]] = icmp ugt i32 [[N:%.*]], [[IV_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
+; CHECK-NEXT:    br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -545,11 +545,36 @@
 ;   b[i] = ((a[i] + 2) * a[i + 1]);
 ; }
 ;
-; NO-SINK-AFTER-LABEL: no_sink_after
-; NO-SINK-AFTER-NOT:   vector.ph:
-; NO-SINK-AFTER:       }
+; SINK-AFTER-LABEL: @sink_after_with_multiple_users
+; SINK-AFTER-LABEL: vector.ph:
+; SINK-AFTER-NEXT:    %n.mod.vf = urem i64 %n, 4
+; SINK-AFTER-NEXT:    %n.vec = sub i64 %n, %n.mod.vf
+; SINK-AFTER-NEXT:    %vector.recur.init = insertelement <4 x i16> poison, i16 %.pre, i32 3
+; SINK-AFTER-NEXT:    br label %vector.body
+
+; SINK-AFTER-LABEL: vector.body:
+; SINK-AFTER-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; SINK-AFTER-NEXT:    %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER-NEXT:    %1 = add i64 %index, 0
+; SINK-AFTER-NEXT:    %2 = add nuw nsw i64 %1, 1
+; SINK-AFTER-NEXT:    %3 = getelementptr inbounds i16, i16* %a, i64 %2
+; SINK-AFTER-NEXT:    %4 = getelementptr inbounds i16, i16* %3, i32 0
+; SINK-AFTER-NEXT:    %5 = bitcast i16* %4 to <4 x i16>*
+; SINK-AFTER-NEXT:    %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43
+; SINK-AFTER-NEXT:    %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    %7 = sext <4 x i16> %6 to <4 x i32>
+; SINK-AFTER-NEXT:    %8 = add nsw <4 x i32> %7, <i32 2, i32 2, i32 2, i32 2>
+; SINK-AFTER-NEXT:    %9 = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER-NEXT:    %10 = mul nsw <4 x i32> %8, %9
+; SINK-AFTER-NEXT:    %11 = getelementptr inbounds i32, i32* %b, i64 %1
+; SINK-AFTER-NEXT:    %12 = getelementptr inbounds i32, i32* %11, i32 0
+; SINK-AFTER-NEXT:    %13 = bitcast i32* %12 to <4 x i32>*
+; SINK-AFTER-NEXT:    store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT:    %index.next = add i64 %index, 4
+; SINK-AFTER-NEXT:    %14 = icmp eq i64 %index.next, %n.vec
+; SINK-AFTER-NEXT:    br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48
 ;
-define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 entry:
   %.pre = load i16, i16* %a
   br label %for.body
@@ -626,7 +651,7 @@
 ; SINK-AFTER-NEXT:    %index.next = add i32 %index, 4
 ; SINK-AFTER-NEXT:    %vec.ind.next = add <4 x i16> %vec.ind, <i16 4, i16 4, i16 4, i16 4>
 ; SINK-AFTER-NEXT:    %6 = icmp eq i32 %index.next, 40
-; SINK-AFTER-NEXT:    br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43
+; SINK-AFTER-NEXT:    br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50
 ;
 entry:
   br label %for.cond
@@ -708,7 +733,7 @@
 ; CHECK-NEXT:    [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP46:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
@@ -719,7 +744,7 @@
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[TMP]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP48:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP48:!llvm.loop !.*]]
 ;
 bb:
   br label %bb2
@@ -835,7 +860,7 @@
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP49:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]])
@@ -846,7 +871,7 @@
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[TMP]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP50:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP50:!llvm.loop !.*]]
 ;
 bb:
   br label %bb2
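
For illustration (not part of the patch): in @can_sink_with_additional_user
above, the recursive walk collects %add.1 (a user of %pre.phi) and %add.2 (a
user of %add.1); both precede Previous (%pre.next) in the loop header, so the
final loop chains them as SinkAfter[%add.1] = %pre.next and
SinkAfter[%add.2] = %add.1. A minimal standalone C++ sketch of just that
chaining step, using plain strings in place of Instruction pointers:

  #include <string>
  #include <utility>
  #include <vector>

  int main() {
    // Candidates in comesBefore order, as the ordered set would yield them.
    std::vector<std::string> InstrsToSink = {"%add.1", "%add.2"};
    std::string Previous = "%pre.next";
    // Insertion-ordered mapping, standing in for the MapVector.
    std::vector<std::pair<std::string, std::string>> SinkAfter;
    for (const std::string &I : InstrsToSink) {
      SinkAfter.emplace_back(I, Previous); // sink I directly after Previous
      Previous = I; // later candidates sink after the one just chained
    }
    // SinkAfter now holds: %add.1 -> %pre.next, %add.2 -> %add.1.
    return 0;
  }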