Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5504,22 +5504,6 @@ break; } - // The only loops we can vectorize without a scalar epilogue, are loops with - // a bottom-test and a single exiting block. We'd have to handle the fact - // that not every instruction executes on the last iteration. This will - // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return MaxVF; - } - return None; - } - // Now try the tail folding // Invalidate interleave groups that require an epilogue if we can't mask @@ -7519,26 +7503,40 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions) { - // We create new control-flow for the vectorized loop, so the original exit - // conditions will be dead after vectorization if it's only used by the - // terminator - SmallVector ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - for (auto *BB : ExitingBlocks) { - auto *Cmp = dyn_cast(BB->getTerminator()->getOperand(0)); - if (!Cmp || !Cmp->hasOneUse()) - continue; - - // TODO: we should introduce a getUniqueExitingBlocks on Loop - if (!DeadInstructions.insert(Cmp).second) - continue; - - // The operands of the icmp is often a dead trunc, used by IndUpdate. 
- // TODO: can recurse through operands in general - for (Value *Op : Cmp->operands()) { - if (isa(Op) && Op->hasOneUse()) + auto handleDeadExit = [&](BasicBlock *BB) { + auto *Term = BB->getTerminator(); + if (!isa(Term) && + (!isa(Term) || !cast(Term)->isConditional())) + return; + auto *Cmp = dyn_cast(Term->getOperand(0)); + if (!Cmp || !Cmp->hasOneUse()) + return; + + if (!DeadInstructions.insert(Cmp).second) + return; + + // The operands of the icmp is often a dead trunc, used by IndUpdate. + // TODO: can recurse through operands in general + for (Value *Op : Cmp->operands()) + if (isa(Op) && Op->hasOneUse()) DeadInstructions.insert(cast(Op)); - } + }; + + + if (CM.foldTailByMasking()) { + // For a tail folded loop, we need any non-latch exit conditions to form + // predicate masks, as such, only the latch condition (if any) is dead. + BasicBlock *Latch = OrigLoop->getLoopLatch(); + if (OrigLoop->isLoopExiting(Latch)) + handleDeadExit(Latch); + } else { + // We create new control-flow for the vectorized loop, so the original exit + // conditions will be dead after vectorization if it's only used by the + // terminator + SmallVector ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + for (auto *BB : ExitingBlocks) + handleDeadExit(BB); } // We create new "steps" for induction variable updates to which the original @@ -7948,10 +7946,12 @@ if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; - // If source is an exiting block, we know the exit edge is dynamically dead - // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction. - if (OrigLoop->isLoopExiting(Src)) + // If source is an exiting block, we know that (if we are using an epilogue + // loop) the exit edge is dynamically dead in the vector loop, and thus we + // don't need to restrict the mask. Avoid adding uses of an otherwise + // potentially dead instruction. 
Note that if we're tail folding, we need + // to form the mask. + if (CM.requiresScalarEpilogue() && OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); Index: llvm/test/Transforms/LoopVectorize/loop-form.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/loop-form.ll +++ llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -166,17 +166,62 @@ ; ; TAILFOLD-LABEL: @early_exit( ; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; TAILFOLD-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0 +; TAILFOLD-NEXT: [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1 +; TAILFOLD-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TAILFOLD: vector.ph: +; TAILFOLD-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 +; TAILFOLD-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 +; TAILFOLD-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; TAILFOLD-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 +; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; TAILFOLD-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer +; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> undef, i32 [[N]], i32 0 +; TAILFOLD-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> undef, <2 x i32> zeroinitializer +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] +; TAILFOLD: vector.body: +; TAILFOLD-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; TAILFOLD-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; TAILFOLD-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; TAILFOLD-NEXT: 
[[TMP3:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64> +; TAILFOLD-NEXT: [[TMP5:%.*]] = and <2 x i1> [[TMP2]], [[TMP3]] +; TAILFOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; TAILFOLD-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; TAILFOLD: pred.store.if: +; TAILFOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; TAILFOLD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP7]] +; TAILFOLD-NEXT: store i16 0, i16* [[TMP8]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]] +; TAILFOLD: pred.store.continue: +; TAILFOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.if3: +; TAILFOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; TAILFOLD-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP10]] +; TAILFOLD-NEXT: store i16 0, i16* [[TMP11]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.continue4: +; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; TAILFOLD-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; TAILFOLD-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; TAILFOLD-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; TAILFOLD: middle.block: +; TAILFOLD-NEXT: br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]] +; TAILFOLD: scalar.ph: +; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TAILFOLD-NEXT: br label [[FOR_COND:%.*]] ; TAILFOLD: for.cond: -; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]] -; TAILFOLD-NEXT: br i1 [[CMP]], label 
[[FOR_BODY]], label [[IF_END:%.*]] +; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] +; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] ; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 -; TAILFOLD-NEXT: br label [[FOR_COND]] +; TAILFOLD-NEXT: br label [[FOR_COND]], [[LOOP5:!llvm.loop !.*]] ; TAILFOLD: if.end: ; TAILFOLD-NEXT: ret void ; @@ -204,33 +249,123 @@ define void @optsize(i16* %p, i32 %n) optsize { ; CHECK-LABEL: @optsize( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> undef, i32 [[N]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i1> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP7]] +; CHECK-NEXT: store i16 0, i16* [[TMP8]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.if3: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP10]] +; CHECK-NEXT: store i16 0, i16* [[TMP11]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue4: +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: 
for.cond: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]] ; CHECK: for.body: ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] ; CHECK-NEXT: store i16 0, i16* [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: br label [[FOR_COND]] +; CHECK-NEXT: br label [[FOR_COND]], [[LOOP7:!llvm.loop !.*]] ; CHECK: if.end: ; CHECK-NEXT: ret void ; ; TAILFOLD-LABEL: @optsize( ; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; TAILFOLD-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0 +; TAILFOLD-NEXT: [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1 +; TAILFOLD-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TAILFOLD: vector.ph: +; TAILFOLD-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 +; TAILFOLD-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 +; TAILFOLD-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; TAILFOLD-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 +; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; TAILFOLD-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer +; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> undef, i32 [[N]], i32 0 +; TAILFOLD-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> undef, <2 x 
i32> zeroinitializer +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] +; TAILFOLD: vector.body: +; TAILFOLD-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; TAILFOLD-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; TAILFOLD-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; TAILFOLD-NEXT: [[TMP3:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64> +; TAILFOLD-NEXT: [[TMP5:%.*]] = and <2 x i1> [[TMP2]], [[TMP3]] +; TAILFOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; TAILFOLD-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; TAILFOLD: pred.store.if: +; TAILFOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; TAILFOLD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP7]] +; TAILFOLD-NEXT: store i16 0, i16* [[TMP8]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]] +; TAILFOLD: pred.store.continue: +; TAILFOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.if3: +; TAILFOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; TAILFOLD-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP10]] +; TAILFOLD-NEXT: store i16 0, i16* [[TMP11]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.continue4: +; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; TAILFOLD-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; TAILFOLD-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; TAILFOLD-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; TAILFOLD: middle.block: +; TAILFOLD-NEXT: br i1 
true, label [[IF_END:%.*]], label [[SCALAR_PH]] +; TAILFOLD: scalar.ph: +; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TAILFOLD-NEXT: br label [[FOR_COND:%.*]] ; TAILFOLD: for.cond: -; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]] -; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]] +; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] +; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] ; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 -; TAILFOLD-NEXT: br label [[FOR_COND]] +; TAILFOLD-NEXT: br label [[FOR_COND]], [[LOOP7:!llvm.loop !.*]] ; TAILFOLD: if.end: ; TAILFOLD-NEXT: ret void ; @@ -284,7 +419,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]] @@ -301,24 +436,73 @@ ; CHECK-NEXT: store i16 0, i16* [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 -; CHECK-NEXT: br i1 [[CMP2]], label 
[[FOR_COND]], label [[IF_END]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP9:!llvm.loop !.*]] ; CHECK: if.end: ; CHECK-NEXT: ret void ; ; TAILFOLD-LABEL: @multiple_unique_exit( ; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; TAILFOLD-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0 +; TAILFOLD-NEXT: [[TMP1:%.*]] = icmp ult i32 [[SMAX]], 2096 +; TAILFOLD-NEXT: [[UMIN:%.*]] = select i1 [[TMP1]], i32 [[SMAX]], i32 2096 +; TAILFOLD-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[UMIN]], 1 +; TAILFOLD-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TAILFOLD: vector.ph: +; TAILFOLD-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 1 +; TAILFOLD-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 +; TAILFOLD-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; TAILFOLD-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP2]], 1 +; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; TAILFOLD-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer +; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> undef, i32 [[N]], i32 0 +; TAILFOLD-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> undef, <2 x i32> zeroinitializer +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] +; TAILFOLD: vector.body: +; TAILFOLD-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; TAILFOLD-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; TAILFOLD-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 +; TAILFOLD-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1 +; TAILFOLD-NEXT: [[TMP5:%.*]] = icmp slt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; TAILFOLD-NEXT: [[TMP6:%.*]] = icmp ule <2 x i32> 
[[VEC_IND]], [[BROADCAST_SPLAT]] +; TAILFOLD-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64> +; TAILFOLD-NEXT: [[TMP8:%.*]] = and <2 x i1> [[TMP5]], [[TMP6]] +; TAILFOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; TAILFOLD-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; TAILFOLD: pred.store.if: +; TAILFOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; TAILFOLD-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP10]] +; TAILFOLD-NEXT: store i16 0, i16* [[TMP11]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]] +; TAILFOLD: pred.store.continue: +; TAILFOLD-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; TAILFOLD-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.if3: +; TAILFOLD-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; TAILFOLD-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP13]] +; TAILFOLD-NEXT: store i16 0, i16* [[TMP14]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.continue4: +; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; TAILFOLD-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; TAILFOLD-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; TAILFOLD-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; TAILFOLD: middle.block: +; TAILFOLD-NEXT: br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]] +; TAILFOLD: scalar.ph: +; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TAILFOLD-NEXT: br label [[FOR_COND:%.*]] ; TAILFOLD: for.cond: -; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]] -; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label 
[[IF_END:%.*]] +; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] +; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]] ; TAILFOLD: for.body: ; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]] +; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]] ; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096 -; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]] +; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP9:!llvm.loop !.*]] ; TAILFOLD: if.end: ; TAILFOLD-NEXT: ret void ; @@ -798,7 +982,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 201, 200 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -819,28 +1003,81 @@ ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: br label [[LOOP_HEADER]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br label [[LOOP_HEADER]], [[LOOP11:!llvm.loop !.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; TAILFOLD-LABEL: @scalar_predication( ; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TAILFOLD: vector.ph: +; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]] +; TAILFOLD: vector.body: +; TAILFOLD-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; TAILFOLD-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; TAILFOLD-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; TAILFOLD-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; TAILFOLD-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[TMP0]] +; TAILFOLD-NEXT: [[TMP3:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP1]] +; TAILFOLD-NEXT: [[TMP4:%.*]] = icmp eq <2 x i64> [[VEC_IND]], +; TAILFOLD-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[VEC_IND]], +; TAILFOLD-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], +; TAILFOLD-NEXT: [[TMP7:%.*]] = and <2 x i1> [[TMP6]], [[TMP5]] +; TAILFOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; TAILFOLD-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; TAILFOLD: pred.load.if: +; TAILFOLD-NEXT: [[TMP9:%.*]] = load float, float* [[TMP2]], align 4 +; TAILFOLD-NEXT: [[TMP10:%.*]] = insertelement <2 x float> undef, float [[TMP9]], i32 0 +; TAILFOLD-NEXT: br label [[PRED_LOAD_CONTINUE]] +; TAILFOLD: pred.load.continue: +; TAILFOLD-NEXT: [[TMP11:%.*]] = phi <2 x float> [ undef, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ] +; TAILFOLD-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; TAILFOLD-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; TAILFOLD: pred.load.if1: +; TAILFOLD-NEXT: [[TMP13:%.*]] = load float, float* [[TMP3]], align 4 +; TAILFOLD-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP13]], i32 1 +; TAILFOLD-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; TAILFOLD: pred.load.continue2: +; TAILFOLD-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; TAILFOLD-NEXT: [[TMP16:%.*]] = fcmp oeq <2 x float> [[TMP15]], zeroinitializer +; TAILFOLD-NEXT: [[TMP17:%.*]] = xor <2 x i1> 
[[TMP16]], +; TAILFOLD-NEXT: [[TMP18:%.*]] = and <2 x i1> [[TMP17]], [[TMP7]] +; TAILFOLD-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP18]], i32 0 +; TAILFOLD-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; TAILFOLD: pred.store.if: +; TAILFOLD-NEXT: store float 1.000000e+01, float* [[TMP2]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]] +; TAILFOLD: pred.store.continue: +; TAILFOLD-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP18]], i32 1 +; TAILFOLD-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.if3: +; TAILFOLD-NEXT: store float 1.000000e+01, float* [[TMP3]], align 4 +; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE4]] +; TAILFOLD: pred.store.continue4: +; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; TAILFOLD-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TAILFOLD-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 202 +; TAILFOLD-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; TAILFOLD: middle.block: +; TAILFOLD-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; TAILFOLD: scalar.ph: +; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 202, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]] ; TAILFOLD: loop.header: -; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr float, float* [[ADDR]], i64 [[IV]] ; TAILFOLD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200 -; TAILFOLD-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]] +; TAILFOLD-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY:%.*]] ; 
TAILFOLD: loop.body: -; TAILFOLD-NEXT: [[TMP0:%.*]] = load float, float* [[GEP]], align 4 -; TAILFOLD-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00 +; TAILFOLD-NEXT: [[TMP22:%.*]] = load float, float* [[GEP]], align 4 +; TAILFOLD-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP22]], 0.000000e+00 ; TAILFOLD-NEXT: br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]] ; TAILFOLD: then: ; TAILFOLD-NEXT: store float 1.000000e+01, float* [[GEP]], align 4 ; TAILFOLD-NEXT: br label [[LOOP_LATCH]] ; TAILFOLD: loop.latch: ; TAILFOLD-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TAILFOLD-NEXT: br label [[LOOP_HEADER]] +; TAILFOLD-NEXT: br label [[LOOP_HEADER]], [[LOOP11:!llvm.loop !.*]] ; TAILFOLD: exit: ; TAILFOLD-NEXT: ret void ;