diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9094,6 +9094,7 @@ for (auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + // If the target is in a replication region, make sure to move Sink to the // block after it, not into the replication region itself. if (auto *Region = @@ -9106,7 +9107,37 @@ continue; } } - Sink->moveAfter(Target); + + auto *SinkRegion = + dyn_cast_or_null(Sink->getParent()->getParent()); + // Unless the sink source is in a replicate region, sink the recipe + // directly. + if (!SinkRegion || !SinkRegion->isReplicator()) { + Sink->moveAfter(Target); + continue; + } + + // If the sink source is in a replicate region, we need to move the whole + // replicate region, which should only contain a single recipe in the main + // block. + assert(Sink->getParent()->size() == 1 && + "parent must be a replicator with a single recipe"); + auto *SplitBlock = + Target->getParent()->splitAt(std::next(Target->getIterator())); + + auto *Pred = SinkRegion->getSinglePredecessor(); + auto *Succ = SinkRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(Pred, SinkRegion); + VPBlockUtils::disconnectBlocks(SinkRegion, Succ); + VPBlockUtils::connectBlocks(Pred, Succ); + + auto *SplitPred = SplitBlock->getSinglePredecessor(); + + VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); + VPBlockUtils::connectBlocks(SplitPred, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); + if (VPBB == SplitPred) + VPBB = SplitBlock; } // Interleave memory: for each Interleave Group we marked earlier as relevant diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1531,6 +1531,11 @@ void dropAllReferences(VPValue *NewValue) override; + /// Split current block at \p SplitAt by inserting a new block between the + /// current block and its successors and moving all recipes starting at + /// SplitAt to the new block. Returns the new block. + VPBasicBlock *splitAt(iterator SplitAt); + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -400,6 +400,32 @@ } } +VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { + assert(SplitAt->getParent() == this && + "can only split at a position in the same block"); + + SmallVector Succs(getSuccessors().begin(), + getSuccessors().end()); + // First, disconnect the current block from its successors. + for (VPBlockBase *Succ : Succs) + VPBlockUtils::disconnectBlocks(this, Succ); + + // Create new empty block after the block to split. + auto *SplitBlock = new VPBasicBlock(getName() + ".split"); + VPBlockUtils::insertBlockAfter(SplitBlock, this); + + // Add successors for block to split to new block. + for (VPBlockBase *Succ : Succs) + VPBlockUtils::connectBlocks(SplitBlock, Succ); + + // Finally, move the recipes starting at SplitAt to new block. + for (VPRecipeBase &ToMove : + make_early_inc_range(make_range(SplitAt, this->end()))) + ToMove.moveBefore(*SplitBlock, SplitBlock->end()); + + return SplitBlock; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -0,0 +1,204 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Test cases for PR50009, which require sinking a replicate-region due to a +; first-order recurrence. + +define void @sink_replicate_region_1(i32 %x, i8* %ptr) optsize { +; CHECK-LABEL: sink_replicate_region_1 +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-PHI %0 = phi 0, %conv +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: EMIT vp<%3> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: Successor(s): loop.0 + +; CHECK: loop.0: +; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv> +; CHECK-NEXT: Successor(s): pred.load + +; CHECK: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%3> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-NEXT: CondBit: vp<%3> (loop) + +; CHECK: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue + +; CHECK: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%lv> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: loop.1: +; CHECK-NEXT: WIDEN ir<%conv> = sext vp<%6> +; CHECK-NEXT: Successor(s): pred.srem + +; CHECK: pred.srem: { +; CHECK-NEXT: pred.srem.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%3> +; CHECK-NEXT: Successor(s): pred.srem.if, pred.srem.continue +; CHECK-NEXT: CondBit: vp<%3> (loop) + +; CHECK: pred.srem.if: +; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%0>, ir<%x> (S->V) +; CHECK-NEXT: Successor(s): pred.srem.continue + +; CHECK: pred.srem.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%rem> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: loop.1.split: +; CHECK-NEXT: WIDEN ir<%add> = add ir<%conv>, vp<%9> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rem = srem i32 %0, %x + %gep = getelementptr i8, i8* %ptr, i32 %iv + %lv = load i8, i8* %gep + %conv = sext i8 %lv to i32 + %add = add i32 %conv, %rem + %iv.next = add nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 20001 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @sink_replicate_region_2(i32 %x, i8 %y, i32* %ptr) optsize { +; CHECK-LABEL: sink_replicate_region_2 +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-PHI %recur = phi 0, %recur.next +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: EMIT vp<%3> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: Successor(s): loop.0 + +; CHECK: loop.0: +; CHECK-NEXT: WIDEN ir<%recur.next> = sext ir<%y> +; CHECK-NEXT: Successor(s): pred.srem + +; CHECK: pred.srem: { +; CHECK-NEXT: pred.srem.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%3> +; CHECK-NEXT: Successor(s): pred.srem.if, pred.srem.continue +; CHECK-NEXT: CondBit: vp<%3> (loop) + +; CHECK: pred.srem.if: +; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%recur>, ir<%x> +; CHECK-NEXT: Successor(s): pred.srem.continue + +; CHECK: pred.srem.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%rem> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: loop.0.split: +; CHECK-NEXT: REPLICATE ir<%add> = add vp<%6>, ir<%recur.next> +; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv> +; CHECK-NEXT: Successor(s): pred.store + +; CHECK: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%3> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%3> (loop) + +; CHECK: pred.store.if: +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep> +; CHECK-NEXT: Successor(s): pred.store.continue + +; CHECK: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: loop.1: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rem = srem i32 %recur, %x + %recur.next = sext i8 %y to i32 + %add = add i32 %rem, %recur.next + %gep = getelementptr i32, i32* %ptr, i32 %iv + store i32 %add, i32* %gep + %iv.next = add nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 20001 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, i32* %ptr) optsize { +; CHECK-LABEL: sink_replicate_region_3_reduction +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-PHI %recur = phi 0, %recur.next +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: WIDEN-PHI ir<%and.red> = phi ir<1234>, ir<%and.red.next> +; CHECK-NEXT: EMIT vp<%4> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: Successor(s): loop.0 + +; CHECK: loop.0: +; CHECK-NEXT: WIDEN ir<%recur.next> = sext ir<%y> +; CHECK-NEXT: Successor(s): pred.srem + +; CHECK: pred.srem: { +; CHECK-NEXT: pred.srem.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%4> +; CHECK-NEXT: Successor(s): pred.srem.if, pred.srem.continue +; CHECK-NEXT: CondBit: vp<%4> (loop) + +; CHECK: pred.srem.if: +; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%recur>, ir<%x> (S->V) +; CHECK-NEXT: Successor(s): pred.srem.continue + +; CHECK: pred.srem.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%rem> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +; CHECK: loop.0.split: +; CHECK-NEXT: WIDEN ir<%add> = add vp<%7>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%and.red.next> = and ir<%and.red>, ir<%add> +; CHECK-NEXT: EMIT vp<%10> = select vp<%4> ir<%and.red.next> ir<%and.red> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ] + %rem = srem i32 %recur, %x + %recur.next = sext i8 %y to i32 + %add = add i32 %rem, %recur.next + %and.red.next = and i32 %and.red, %add + %iv.next = add nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 20001 + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %and.red.next, %loop ] + ret i32 %res +}