diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8180,40 +8180,123 @@
   return BlockMaskCache[BB] = BlockMask;
 }
 
-void VPRecipeBuilder::createTailFoldHeaderMask(VPlan &Plan) {
-  assert(CM.foldTailByMasking() && "must fold the tail");
-  BasicBlock *Header = OrigLoop->getHeader();
+static bool mayCauseUB(unsigned Opcode) {
+  switch (Opcode) {
+  case Instruction::UDiv:
+  case Instruction::URem:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static void performTailFolding(VPlan &Plan, TailFoldingStyle TFStyle,
+                               Type *BoolTy) {
+  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
+  VPBuilder Builder;
+  VPValue *HeaderMask = nullptr;
+  // First, introduce recipes to compute the mask for the vector loop header.
+  //
   // If we're using the active lane mask for control flow, then we get the
   // mask from the active lane mask PHI that is cached in the VPlan.
-  TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
   if (useActiveLaneMaskForControlFlow(TFStyle)) {
-    BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi();
-    return;
+    HeaderMask = Plan.getActiveLaneMaskPhi();
+  } else {
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+    // constructing the desired canonical IV in the header block as its first
+    // non-phi instructions.
+    auto NewInsertionPoint = Header->getFirstNonPhi();
+    auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+    Header->insert(IV, Header->getFirstNonPhi());
+
+    VPBuilder::InsertPointGuard Guard(Builder);
+    Builder.setInsertPoint(Header, NewInsertionPoint);
+    if (useActiveLaneMask(TFStyle)) {
+      VPValue *TC = Plan.getTripCount();
+      HeaderMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+                                        nullptr, "active.lane.mask");
+    } else {
+      VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+      HeaderMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+    }
   }
 
-  // Introduce the early-exit compare IV <= BTC to form header block mask.
-  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
-  // constructing the desired canonical IV in the header block as its first
-  // non-phi instructions.
+  // Iterate over all recipes and adjust their masks as needed.
+  auto Iter = vp_depth_first_deep(Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    VPValue *BlockMask = nullptr;
+    VPValue *NewMask = nullptr;
+
+    auto OrWithHeaderMask = [&](VPValue *BlockInMask, VPRecipeBase *InsertPt) {
+      if (!NewMask) {
+        VPBuilder::InsertPointGuard Guard(Builder);
+        Builder.setInsertPoint(InsertPt->getParent(), InsertPt->getIterator());
+        VPValue *False =
+            Plan.getVPValueOrAddLiveIn(ConstantInt::getFalse(BoolTy));
+        NewMask = Builder.createSelect(HeaderMask, BlockInMask, False, {});
+      }
+      return NewMask;
+    };
 
-  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
-  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
-  HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
+    auto NeedsNewMask = [&](VPRecipeBase &R) {
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+        return !RepR->isUniform() &&
+               (RepR->mayHaveSideEffects() || RepR->mayReadFromMemory() ||
+                mayCauseUB(RepR->getUnderlyingInstr()->getOpcode()));
+      }
+      if (auto *CallR = dyn_cast<VPWidenCallRecipe>(&R))
+        return CallR->needsMask();
 
-  VPBuilder::InsertPointGuard Guard(Builder);
-  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-  VPValue *BlockMask = nullptr;
-  if (useActiveLaneMask(TFStyle)) {
-    VPValue *TC = Plan.getTripCount();
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
-                                     nullptr, "active.lane.mask");
-  } else {
-    VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+      return isa<VPWidenMemoryInstructionRecipe>(&R);
+    };
+    for (VPRecipeBase &R : *VPBB) {
+      if (R.isMasked()) {
+        if (!BlockMask) {
+          BlockMask = R.getMask();
+        }
+        assert(BlockMask == R.getMask());
+        R.setOperand(R.getNumOperands() - 1, OrWithHeaderMask(BlockMask, &R));
+        continue;
+      }
+      if (NeedsNewMask(R)) {
+        R.addOperand(HeaderMask);
+        continue;
+      }
+      if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+        for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
+          VPValue *Cond = Blend->getMask(I);
+          R.setOperand(R.getNumOperands() - 1, OrWithHeaderMask(Cond, &R));
+        }
+      } else if (auto *BOM = dyn_cast<VPBranchOnMaskRecipe>(&R)) {
+        if (auto *Cond = BOM->getMask()) {
+          if (!BlockMask)
+            BlockMask = Cond;
+          R.setOperand(R.getNumOperands() - 1, OrWithHeaderMask(BlockMask, &R));
+        } else
+          R.addOperand(HeaderMask);
+      }
+    }
+  }
+
+  // If tail is folded by masking, introduce selects between the phi
+  // and the live-out instruction of each reduction, at the beginning of the
+  // dedicated latch block.
+  VPBasicBlock *Latch = cast<VPBasicBlock>(TopRegion->getExiting());
+  Builder.setInsertPoint(Latch, Latch->begin());
+  for (VPRecipeBase &R : Header->phis()) {
+    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+    if (!PhiR || PhiR->isInLoop())
+      continue;
+    auto *Red = PhiR->getBackedgeValue();
+    assert(Red->getDefiningRecipe()->getParent() != Latch &&
+           "reduction recipe must be defined before latch");
+    Builder.createNaryOp(Instruction::Select, {HeaderMask, Red, PhiR});
   }
-  BlockMaskCache[Header] = BlockMask;
 }
 
 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
@@ -8472,7 +8555,7 @@
     }
 
     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
-                                 Intrinsic::not_intrinsic, Variant);
+                                 Intrinsic::not_intrinsic, Variant, NeedsMask);
   }
 
   return nullptr;
@@ -8889,11 +8972,6 @@
       DLInst ? DLInst->getDebugLoc() : DebugLoc(),
       CM.getTailFoldingStyle(IVUpdateMayOverflow));
 
-  // Proactively create header mask when tail-folding. Masks for other blocks
-  // are created on demand.
-  if (CM.foldTailByMasking())
-    RecipeBuilder.createTailFoldHeaderMask(*Plan);
-
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   LoopBlocksDFS DFS(OrigLoop);
@@ -8946,9 +9024,11 @@
 
       VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
       bool IsPredicated = CM.isPredicatedInst(&I);
-      if (IsPredicated)
-        Recipe->addOperand(
-            RecipeBuilder.createBlockInMask(I.getParent(), *Plan));
+      if (IsPredicated) {
+        auto *M = RecipeBuilder.createBlockInMask(I.getParent(), *Plan);
+        if (M)
+          Recipe->addOperand(M);
+      }
 
       for (auto *Def : Recipe->definedValues()) {
         auto *UV = Def->getUnderlyingValue();
@@ -9030,6 +9110,12 @@
     }
   }
 
+  // Update Plan to fold the tail by masking.
+  if (CM.foldTailByMasking())
+    performTailFolding(
+        *Plan, CM.getTailFoldingStyle(),
+        IntegerType::get(OrigLoop->getHeader()->getContext(), 1));
+
   for (ElementCount VF : Range)
     Plan->addVF(VF);
   Plan->setName("Initial VPlan");
@@ -9221,25 +9307,6 @@
     }
   }
 
-  // If tail is folded by masking, introduce selects between the phi
-  // and the live-out instruction of each reduction, at the beginning of the
-  // dedicated latch block.
-  if (CM.foldTailByMasking()) {
-    Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
-    for (VPRecipeBase &R :
-         Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-      VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
-      if (!PhiR || PhiR->isInLoop())
-        continue;
-      VPValue *Cond =
-          RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
-      VPValue *Red = PhiR->getBackedgeValue();
-      assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
-             "reduction recipe must be defined before latch");
-      Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
-    }
-  }
-
   VPlanTransforms::clearReductionWrapFlags(*Plan);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -138,9 +138,6 @@
   /// tail folding. It returns the *entry* mask for the block BB.
   VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
 
-  /// Create the mask for the vector loop header block when tail-folding.
-  void createTailFoldHeaderMask(VPlan &Plan);
-
   /// A helper function that computes the predicate of the edge between SRC
   /// and DST.
   VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1224,13 +1224,16 @@
   /// VF with a valid variant.
   Function *Variant;
 
+  bool NeedsMask = false;
+
 public:
   template <typename IterT>
   VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
                     Intrinsic::ID VectorIntrinsicID,
-                    Function *Variant = nullptr)
+                    Function *Variant = nullptr, bool NeedsMask = false)
       : VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I),
-        VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {}
+        VectorIntrinsicID(VectorIntrinsicID), Variant(Variant),
+        NeedsMask(NeedsMask) {}
 
   ~VPWidenCallRecipe() override = default;
 
@@ -1244,6 +1247,12 @@
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  unsigned getNumNonMaskOperands() const override {
+    return NeedsMask ? Variant->arg_size() - 1 : getNumOperands();
+  }
+
+  bool needsMask() const { return NeedsMask; }
 };
 
 /// A recipe for widening select instructions.
@@ -1785,6 +1794,9 @@
   VPValue *getCondOp() const {
     return getNumOperands() > 2 ? getOperand(2) : nullptr;
   }
+
+  /// Returns the number of operands excluding mask operands.
+  unsigned getNumNonMaskOperands() const override { return 2; }
 };
 
 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
@@ -1948,10 +1960,6 @@
     addOperand(Mask);
   }
 
-  bool isMasked() const {
-    return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
-  }
-
 public:
   VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
                                  bool Consecutive, bool Reverse)
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -16,46 +16,22 @@
 ; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
-; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[VEC_IV1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[VEC_IV2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IV3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV1]], 14
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV2]], 14
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV3]], 14
-; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: store i32 0, ptr [[TMP4]], align 4
 ; CHECK-NEXT: store i32 0, ptr [[TMP5]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
-; CHECK: pred.store.if4:
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: store i32 0, ptr [[TMP6]], align 4
 ; CHECK-NEXT: store i32 0, ptr [[TMP7]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]]
-; CHECK: pred.store.continue5:
-; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
-; CHECK: pred.store.if6:
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]]
-; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]]
-; CHECK: pred.store.continue7:
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
-; CHECK: pred.store.if8:
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP10]]
-; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
-; CHECK: pred.store.continue9:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -69,7 +45,7 @@
 ; CHECK-NEXT: store i32 0, ptr [[DST_PTR]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -95,50 +71,26 @@
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 128
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14
-; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK: pred.store.if7:
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8
 ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP1]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
-; CHECK: pred.store.continue8:
-; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK: pred.store.if9:
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP9]]
 ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP2]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
-; CHECK: pred.store.continue10:
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; CHECK: pred.store.if11:
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP11]]
 ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP3]], align 8
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
-; CHECK: pred.store.continue12:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -151,7 +103,7 @@
 ; CHECK-NEXT: store double 0.000000e+00, ptr [[ADDR]], align 8
 ; CHECK-NEXT: [[PTR]] = getelementptr inbounds double, ptr [[ADDR]], i64 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[PTR]], [[PTR2]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
   %ptr2 = getelementptr inbounds double, ptr %ptr1, i64 15