Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -186,6 +186,15 @@ } }; +/// A pair of VPlan and VectorizationFactor, used as the best result of costing +/// different VPlans. +struct VPlanVFPair { + /// The Plan + VPlan *Plan; + /// The VF/Cost from costing + VectorizationFactor VF; +}; + /// Planner drives the vectorization process after having passed /// Legality checks. class LoopVectorizationPlanner { @@ -247,14 +256,14 @@ /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional plan(ElementCount UserVF, unsigned UserIC); + Optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + VPlanVFPair planInVPlanNativePath(ElementCount UserVF); /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(ElementCount VF, unsigned UF); + void setBestPlan(VPlan *Plan, ElementCount VF, unsigned UF); /// Generate the IR code for the body of the vectorized loop according to the /// best selected VPlan. Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -317,6 +317,10 @@ cl::desc("Enable VPlan-native vectorization path predicator with " "support for outer loop vectorization.")); +cl::opt CostUsingVPlan("cost-using-vplan", cl::init(false), cl::Hidden, + cl::desc("Enable VPlan based costing path. 
To " + "become the default in the future.")); + // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the @@ -1077,6 +1081,11 @@ /// possible. VectorizationFactor selectVectorizationFactor(unsigned MaxVF); + /// \return The most profitable vplan and VF from a list of VPlans. + VPlanVFPair + selectVectorizationFactorFromVPlans(SmallVectorImpl &VPlans, + unsigned MaxVF); + /// Setup cost-based decisions for user vectorization factor. void selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); @@ -1092,7 +1101,8 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); + unsigned selectInterleaveCount(VPlan *Plan, ElementCount VF, + unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1436,6 +1446,10 @@ Scalars.clear(); } + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + private: unsigned NumPredStores = 0; @@ -1444,25 +1458,12 @@ /// to cost. unsigned computeFeasibleMaxVF(unsigned ConstTripCount); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on - /// vector values after type legalization in the backend. If this latter value - /// is - /// false, then all operations will be scalarized (i.e. no vectorization has - /// actually taken place). 
-  using VectorizationCostTy = std::pair<unsigned, bool>;
-
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(ElementCount VF);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
@@ -5451,6 +5452,74 @@
   return Factor;
 }
 
+VPlanVFPair LoopVectorizationCostModel::selectVectorizationFactorFromVPlans(
+    SmallVectorImpl<VPlanPtr> &VPlans, unsigned MaxVF) {
+  VPCostContext Ctx{*this, *Legal};
+  bool ForceVectorization =
+      Hints->getForce() == LoopVectorizeHints::FK_Enabled && MaxVF > 1;
+
+  VPlan *BestPlan = nullptr, *ScalarPlan = nullptr;
+  ElementCount BestVF = ElementCount::getNull();
+  float BestCost = 0.0f, ScalarCost = 0.0f;
+  for (const auto &Plan : VPlans) {
+    for (ElementCount VF : Plan->getVFs()) {
+      if (VF.getKnownMinValue() > MaxVF) {
+        LLVM_DEBUG(dbgs() << " Skipping due to MaxVF\n");
+        continue;
+      }
+
+      VectorizationCostTy Cost = Plan->cost(VF, Ctx);
+      float VectorCost = Cost.first / (float)VF.getKnownMinValue();
+      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF.getKnownMinValue()
+                        << " costs: " << (int)VectorCost << ".\n");
+      // Record the scalar plan before any filtering: it is the fallback used
+      // below, so it must be known even when vectorization is forced.
+      if (!ScalarPlan && VF.isScalar()) {
+        ScalarPlan = &*Plan;
+        ScalarCost = VectorCost;
+      }
+      if (ForceVectorization && VF.isScalar()) {
+        LLVM_DEBUG(dbgs() << " Skipping due to force vectorization\n");
+        continue;
+      }
+      if (!VF.isScalar() && !Cost.second && !ForceVectorization) {
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Not considering vector loop of width "
+            << VF.getKnownMinValue()
+            << " because it will not generate any vector instructions.\n");
+        continue;
+      }
+
+      if (!BestPlan || VectorCost < BestCost) {
+        BestPlan = &*Plan;
+        BestVF = VF;
+        BestCost = VectorCost;
+      }
+    }
+  }
+
+  if (!EnableCondStoresVectorization && NumPredStores) {
+    reportVectorizationFailure("There are conditional stores.",
+        "store that is conditionally executed prevents vectorization",
+        "ConditionalStore", ORE, TheLoop);
+    // Discard any vector candidate so that the scalar fallback below is taken.
+    BestPlan = nullptr;
+  }
+
+  if (!BestPlan) {
+    assert(ScalarPlan && "Expected a scalar VPlan to fall back to!");
+    BestPlan = ScalarPlan;
+    BestVF = ElementCount::getFixed(1);
+    BestCost = ScalarCost;
+  }
+
+  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestVF << ".\n");
+  VectorizationFactor Factor = {
+      BestVF, (unsigned)(BestCost * BestVF.getKnownMinValue())};
+  return {BestPlan, Factor};
+}
+
 std::pair<unsigned, unsigned>
 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   unsigned MinWidth = -1U;
@@ -5507,7 +5576,8 @@
   return {MinWidth, MaxWidth};
 }
 
-unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(VPlan *Plan,
+                                                           ElementCount VF,
                                                            unsigned LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -5612,8 +5682,13 @@
 
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
- if (LoopCost == 0) - LoopCost = expectedCost(VF).first; + if (LoopCost == 0) { + if (CostUsingVPlan) { + VPCostContext Ctx{*this, *Legal}; + LoopCost = Plan->cost(VF, Ctx).first; + } else + LoopCost = expectedCost(VF).first; + } assert(LoopCost && "Non-zero loop cost expected"); @@ -6068,8 +6143,7 @@ return Discount; } -LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(ElementCount VF) { +VectorizationCostTy LoopVectorizationCostModel::expectedCost(ElementCount VF) { assert(!VF.isScalable() && "scalable vectors not yet supported."); VectorizationCostTy Cost; @@ -6314,7 +6388,7 @@ return getWideningCost(I, VF); } -LoopVectorizationCostModel::VectorizationCostTy +VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { assert(!VF.isScalable() && @@ -6927,7 +7001,7 @@ return WidestVectorRegBits / WidestType; } -VectorizationFactor +VPlanVFPair LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; @@ -6959,19 +7033,20 @@ // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) - return VectorizationFactor::Disabled(); + return {nullptr, VectorizationFactor::Disabled()}; - return {VF, 0 /*Cost*/}; + assert(VPlans.size() == 1 && "Expected a single vplan!"); + return {&*VPlans.front(), {VF, 0 /*Cost*/}}; } LLVM_DEBUG( dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " "VPlan-native path.\n"); - return VectorizationFactor::Disabled(); + return {nullptr, VectorizationFactor::Disabled()}; } -Optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +Optional LoopVectorizationPlanner::plan(ElementCount UserVF, + unsigned UserIC) { assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); assert(OrigLoop->isInnermost() && "Inner loop expected."); Optional MaybeMaxVF = @@ -7004,7 +7079,9 @@ buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), UserVF.getKnownMinValue()); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + assert(VPlans.size() == 1 && VPlans.front()->hasVF(UserVF) && + "Expected a correct width vplan!"); + return VPlanVFPair{&*VPlans.front(), {UserVF, 0}}; } unsigned MaxVF = MaybeMaxVF.getValue(); @@ -7024,22 +7101,38 @@ buildVPlansWithVPRecipes(1, MaxVF); LLVM_DEBUG(printPlans(dbgs())); - if (MaxVF == 1) - return VectorizationFactor::Disabled(); + if (MaxVF == 1) { + assert(VPlans.size() == 1 && + VPlans.front()->hasVF(ElementCount::getFixed(MaxVF))); + return VPlanVFPair{&*VPlans.front(), VectorizationFactor::Disabled()}; + } // Select the optimal vectorization factor. - return CM.selectVectorizationFactor(MaxVF); + if (CostUsingVPlan) + return CM.selectVectorizationFactorFromVPlans(VPlans, MaxVF); + else { + VectorizationFactor VF = CM.selectVectorizationFactor(MaxVF); + for (VPlanPtr &Plan : VPlans) + if (Plan->hasVF(VF.Width)) + return VPlanVFPair{&*Plan, VF}; + llvm_unreachable("Expected to find a vplan with width VF!"); + } } -void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { +void LoopVectorizationPlanner::setBestPlan(VPlan *Plan, ElementCount VF, + unsigned UF) { LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); BestVF = VF; BestUF = UF; - erase_if(VPlans, [VF](const VPlanPtr &Plan) { - return !Plan->hasVF(VF); - }); + if (!Plan) { + // No best. 
+ VPlans.clear(); + return; + } + + erase_if(VPlans, [Plan](const VPlanPtr &P) { return &*P != Plan; }); assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); } @@ -7528,6 +7621,7 @@ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); + RegSucc->setReciprocalPredBlockProb(getReciprocalPredBlockProb()); VPBlockUtils::insertBlockAfter(RegSucc, Region); return RegSucc; } @@ -7546,10 +7640,13 @@ assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + Entry->setReciprocalPredBlockProb(Builder.getInsertBlock()->getReciprocalPredBlockProb()); auto *PHIRecipe = Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + Exit->setReciprocalPredBlockProb(getReciprocalPredBlockProb()); auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); + Pred->setReciprocalPredBlockProb(getReciprocalPredBlockProb()); VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); // Note: first set Entry as region entry and then connect successors starting @@ -7718,6 +7815,11 @@ VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); + // Update the ReciprocalPredBlockProb of the block, used in costing. + // FIXME: This is not very accurate, and could be improved / replaced. + if (CM.blockNeedsPredication(BB)) + VPBB->setReciprocalPredBlockProb(getReciprocalPredBlockProb()); + // Introduce each ingredient into VPlan. // TODO: Model and preserve debug instrinsics in VPlan. 
for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -7940,6 +8042,90 @@ return ILV.getOrCreateScalarValue(V, Instance); } +VectorizationCostTy VPlan::cost(ElementCount VF, VPCostContext &Ctx) { + VectorizationCostTy Cost; + + for (VPBlockBase *Block : depth_first(Entry)) { + VectorizationCostTy C = Block->cost(VF, Ctx); + + Cost.first += C.first; + Cost.second |= C.second; + } + + // The vplan does not contain the add+icmp for the loop iteration check. Add + // those costs here. + unsigned ExtraCost = + Ctx.CM.TTI.getArithmeticInstrCost(Instruction::Add, + Ctx.Legal.getWidestInductionType()) + + Ctx.CM.TTI.getCmpSelInstrCost(Instruction::ICmp, + Ctx.Legal.getWidestInductionType()); + Cost.first += ExtraCost; + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << ExtraCost + << " for VF " << VF + << " For loop induction check (add + icmp)\n"); + // And then add the cost of the backedge, which is often but not always 0. + ExtraCost = + Ctx.CM.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); + Cost.first += ExtraCost; + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << ExtraCost + << " for VF " << VF + << " For loop backedge cost (br)\n"); + + return Cost; +} + +VectorizationCostTy VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { + ReversePostOrderTraversal RPOT(Entry); + VectorizationCostTy Cost; + + for (VPBlockBase *Block : RPOT) { + VectorizationCostTy C = Block->cost(VF, Ctx); + + Cost.first += C.first; + Cost.second |= C.second; + } + + return Cost; +} + +VectorizationCostTy VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) { + VectorizationCostTy BlockCost; + VPSlotTracker Tracker(getPlan()); + + for (VPRecipeBase &Recipe : Recipes) { + // Skip ignored values. + // FIXME: This should go via VPValues getUnderlyingValue. 
+ VPValue *Val = Recipe.toVPValue(); + if (Val && (Ctx.CM.ValuesToIgnore.count(Val->getUnderlyingValue()) || + (VF.isVector() && + Ctx.CM.VecValuesToIgnore.count(Val->getUnderlyingValue())))) + continue; + + VectorizationCostTy C = Recipe.cost(VF, Ctx); + + // Check if we should override the cost. + if (ForceTargetInstructionCost.getNumOccurrences() > 0) + C.first = ForceTargetInstructionCost; + + BlockCost.first += C.first; + BlockCost.second |= C.second; + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first + << " for VF " << VF << " For recipe: "; + Recipe.print(dbgs(), "", Tracker); dbgs() << '\n'); + } + + // If we are vectorizing a predicated block, it will have been + // if-converted. This means that the block's instructions (aside from + // stores and instructions that may divide by zero) will now be + // unconditionally executed. For the scalar case, we may not always execute + // the predicated block. Thus, scale the block's cost by the probability of + // executing it. 
+ if (VF.isScalar()) + BlockCost.first /= getReciprocalPredBlockProb(); + + return BlockCost; +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; @@ -7960,28 +8146,57 @@ State.ILV->widenCallInstruction(Ingredient, *this, State); } +VectorizationCostTy VPWidenCallRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(&Ingredient, VF); +} + void VPWidenSelectRecipe::execute(VPTransformState &State) { State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State); } +VectorizationCostTy VPWidenSelectRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(&Ingredient, VF); +} + void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(*getUnderlyingInstr(), *this, State); } +VectorizationCostTy VPWidenRecipe::cost(ElementCount VF, VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(getUnderlyingInstr(), VF); +} + void VPWidenGEPRecipe::execute(VPTransformState &State) { State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant, IsIndexLoopInvariant, State); } +VectorizationCostTy VPWidenGEPRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(GEP, VF); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); } +VectorizationCostTy VPWidenIntOrFpInductionRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(IV, VF); +} + void VPWidenPHIRecipe::execute(VPTransformState &State) { State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); } +VectorizationCostTy VPWidenPHIRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(Phi, VF); +} + void VPBlendRecipe::execute(VPTransformState &State) { 
State.ILV->setDebugLocFromInst(State.Builder, Phi); // We know that all PHIs in non-header blocks are converted into @@ -8021,11 +8236,28 @@ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); } +VectorizationCostTy VPBlendRecipe::cost(ElementCount VF, VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(Phi, VF); +} + void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); } +VectorizationCostTy VPInterleaveRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + VectorizationCostTy Cost = {0, false}; + for (unsigned i = 0; i < IG->getNumMembers(); i++) { + if (!IG->getMember(i)) + continue; + VectorizationCostTy MC = Ctx.CM.getInstructionCost(IG->getMember(i), VF); + Cost.first += MC.first; + Cost.second |= MC.second; + } + return Cost; +} + void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); for (unsigned Part = 0; Part < State.UF; ++Part) { @@ -8059,6 +8291,15 @@ } } +VectorizationCostTy VPReductionRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + unsigned Cost = Ctx.CM.TTI.getArithmeticReductionCost( + RdxDesc->getRecurrenceBinOp(), + VectorType::get(RdxDesc->getRecurrenceType(), VF), false, + TTI::TCK_RecipThroughput); + return {Cost, false}; +} + void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. 
State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance, @@ -8087,6 +8328,11 @@ IsPredicated, State); } +VectorizationCostTy VPReplicateRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(Ingredient, VF); +} + void VPBranchOnMaskRecipe::execute(VPTransformState &State) { assert(State.Instance && "Branch on Mask works only on single instance."); @@ -8113,6 +8359,28 @@ ReplaceInstWithInst(CurrentTerminator, CondBr); } +VectorizationCostTy VPBranchOnMaskRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + // In cases of scalarized and predicated instructions, there will be VF + // predicated blocks in the vectorized loop. Each branch around these + // blocks requires also an extract of its vector compare i1 element. + if (VF.isVector()) { + // Return cost for branches around scalarized and predicated blocks. + assert(!VF.isScalable() && "scalable vectors not yet supported."); + LLVMContext &C = Ctx.CM.TheLoop->getHeader()->getContext(); + auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(C), VF); + unsigned Cost = + Ctx.CM.TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), false, + true) + + (Ctx.CM.TTI.getCFInstrCost(Instruction::Br, + TargetTransformInfo::TCK_RecipThroughput) * + VF.getKnownMinValue()); + return {Cost, false}; + } + return {0, false}; +} + void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = cast( @@ -8144,6 +8412,11 @@ } } +VectorizationCostTy VPPredInstPHIRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return { 0, false }; +} + void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Instruction *Instr = getUnderlyingInstr(); VPValue *StoredValue = isa(Instr) ? 
getStoredValue() : nullptr; @@ -8152,6 +8425,24 @@ StoredValue, getMask()); } +VectorizationCostTy VPWidenMemoryInstructionRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return Ctx.CM.getInstructionCost(getUnderlyingInstr(), VF); +} + +VectorizationCostTy VPWidenCanonicalIVRecipe::cost(ElementCount VF, + VPCostContext &Ctx) { + return {Ctx.CM.TTI.getCFInstrCost(Instruction::PHI, + TargetTransformInfo::TCK_RecipThroughput), + false}; +} + +VectorizationCostTy VPInstruction::cost(ElementCount VF, VPCostContext &Ctx) { + // FIXME: Cost everything that a VPInstruction can be, which likely needs type + // information. + return {0, false}; +} + // Determine how to lower the scalar epilogue, which depends on 1) optimising // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // predication, and 4) a TTI hook that analyses whether the loop is suitable @@ -8234,20 +8525,19 @@ const unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = - LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); + auto PlanVF = LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. // Also, do not attempt to vectorize if no vector code will be produced. 
if (VPlanBuildStressTest || EnableVPlanPredication || - VectorizationFactor::Disabled() == VF) + VectorizationFactor::Disabled() == PlanVF.VF) return false; - LVP.setBestPlan(VF.Width, 1); + LVP.setBestPlan(PlanVF.Plan, PlanVF.VF.Width, 1); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, PlanVF.VF.Width, 1, + LVL, &CM, BFI, PSI); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -8401,16 +8691,18 @@ unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. - Optional MaybeVF = + Optional MaybeVF = LVP.plan(ElementCount::getFixed(UserVF), UserIC); + VPlan *BestPlan = nullptr; VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; if (MaybeVF) { - VF = *MaybeVF; + BestPlan = (*MaybeVF).Plan; + VF = (*MaybeVF).VF; // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(BestPlan, VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -8502,7 +8794,7 @@ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - LVP.setBestPlan(VF.Width, IC); + LVP.setBestPlan(BestPlan, VF.Width, IC); using namespace ore; bool DisableRuntimeUnroll = false; Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -59,6 +59,8 @@ class VPRegionBlock; class VPlan; class VPlanSlp; +class LoopVectorizationCostModel; +class LoopVectorizationLegality; /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. 
The range includes start and excludes end, e.g.,: @@ -87,6 +89,13 @@ unsigned Lane; }; +/// The vectorization cost is a combination of the cost itself and a boolean +/// indicating whether any of the contributing operations will actually +/// operate on vector values after type legalization in the backend. If this +/// latter value is false, then all operations will be scalarized (i.e. no +/// vectorization has actually taken place). +using VectorizationCostTy = std::pair; + /// This is a helper struct for maintaining vectorization state. It's used for /// mapping values from the original loop to their corresponding values in /// the new loop. Two mappings are maintained: one for vectorized values and @@ -358,6 +367,16 @@ VPCallback &Callback; }; +/// A struct to hold the context used during cost calculations. Currently just +/// holds references to the CostModel and Legality, to be expanded as needed. +struct VPCostContext { + /// The original CostModel, which is currently used for getting instruction + /// cost. + LoopVectorizationCostModel &CM; + /// The Legality analysis. + LoopVectorizationLegality &Legal; +}; + /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. class VPBlockBase { @@ -583,6 +602,8 @@ /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(struct VPTransformState *State) = 0; + virtual VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) = 0; + /// Delete all blocks reachable from a given VPBlockBase, inclusive. static void deleteCFG(VPBlockBase *Entry); @@ -654,6 +675,8 @@ /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(struct VPTransformState &State) = 0; + virtual VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) = 0; + /// Each recipe prints itself. virtual void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const = 0; @@ -776,6 +799,8 @@ /// provided. 
void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the Recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -836,6 +861,8 @@ /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -861,6 +888,8 @@ /// Produce a widened version of the call instruction. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -892,6 +921,8 @@ /// Produce a widened version of the select instruction. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -930,6 +961,8 @@ /// Generate the gep nodes. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -955,6 +988,8 @@ /// needed by their users. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -976,6 +1011,8 @@ /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1016,6 +1053,8 @@ /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1055,6 +1094,8 @@ /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1097,6 +1138,8 @@ /// Generate the reduction in the loop void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1154,6 +1197,8 @@ /// the \p State. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + void setAlsoPack(bool Pack) { AlsoPack = Pack; } /// Print the recipe. @@ -1178,10 +1223,12 @@ /// conditional branch. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override { - O << " +\n" << Indent << "\"BRANCH-ON-MASK "; + O << Indent << "\"BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) Mask->print(O, SlotTracker); else @@ -1221,6 +1268,8 @@ /// Generates phi nodes for live-outs as needed to retain SSA form. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1289,6 +1338,8 @@ /// Generate the wide load/store. void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1318,6 +1369,8 @@ /// step = . void execute(VPTransformState &State) override; + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1334,6 +1387,8 @@ /// The VPRecipes held in the order of output instructions to generate. RecipeListTy Recipes; + unsigned ReciprocalPredBlockProb = 1; + public: VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) : VPBlockBase(VPBasicBlockSC, Name.str()) { @@ -1404,6 +1459,13 @@ /// Return the position of the first non-phi node recipe in the block. iterator getFirstNonPhi(); + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; + + unsigned getReciprocalPredBlockProb() const { + return ReciprocalPredBlockProb; + } + void setReciprocalPredBlockProb(unsigned V) { ReciprocalPredBlockProb = V; } + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1490,6 +1552,8 @@ /// The method which generates the output IR instructions that correspond to /// this VPRegionBlock, thereby "executing" the VPlan. void execute(struct VPTransformState *State) override; + + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx) override; }; //===----------------------------------------------------------------------===// @@ -1679,6 +1743,8 @@ /// Generate the IR code for this VPlan. 
void execute(struct VPTransformState *State); + VectorizationCostTy cost(ElementCount VF, VPCostContext &Ctx); + VPBlockBase *getEntry() { return Entry; } const VPBlockBase *getEntry() const { return Entry; } @@ -1695,6 +1761,8 @@ return BackedgeTakenCount; } + const SmallSetVector &getVFs() const { return VFs; } + void addVF(ElementCount VF) { VFs.insert(VF); } bool hasVF(ElementCount VF) { return VFs.count(VF); } Index: llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll +++ llvm/test/Analysis/CostModel/X86/interleave-load-i32.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -10,11 +11,16 @@ ; Function Attrs: nounwind uwtable define void @load_i32_interleave4() { ;CHECK-LABEL: load_i32_interleave4 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load -;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load -;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load -;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load -;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 5 for VF 4 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 8 for VF 8 
For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 22 for VF 16 For instruction: %0 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load +;CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: Found an estimated cost of 22 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 entry: br label %for.body @@ -46,11 +52,16 @@ define void @load_i32_interleave5() { ;CHECK-LABEL: load_i32_interleave5 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load -;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load -;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load -;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load -;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 6 for VF 2 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 9 for VF 4 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 18 for VF 8 For instruction: %0 = load +;CHECK-CM: Found an estimated cost of 35 for VF 16 For instruction: %0 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load +;CHECK-VP: Found an estimated cost of 6 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 +;CHECK-VP: Found an estimated cost of 9 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 +;CHECK-VP: Found an estimated cost of 18 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 +;CHECK-VP: Found an estimated cost of 35 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 entry: br label %for.body Index: llvm/test/Analysis/CostModel/X86/interleave-store-i32.ll 
=================================================================== --- llvm/test/Analysis/CostModel/X86/interleave-store-i32.ll +++ llvm/test/Analysis/CostModel/X86/interleave-store-i32.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -10,11 +11,16 @@ ; Function Attrs: nounwind uwtable define void @store_i32_interleave4() { ;CHECK-LABEL: store_i32_interleave4 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16 -;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16 -;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16 -;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16 -;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16 +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16 +;CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16 +;CHECK-CM: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16 +;CHECK-CM: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16 +;CHECK-CM: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16 +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %add16 +;CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: Found an estimated 
cost of 11 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: Found an estimated cost of 22 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 entry: br label %for.body @@ -46,11 +52,16 @@ define void @store_i32_interleave5() { ;CHECK-LABEL: store_i32_interleave5 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22 -;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22 -;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22 -;CHECK: Found an estimated cost of 21 for VF 8 For instruction: store i32 %add22 -;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22 +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22 +;CHECK-CM: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22 +;CHECK-CM: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22 +;CHECK-CM: Found an estimated cost of 21 for VF 8 For instruction: store i32 %add22 +;CHECK-CM: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22 +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %add22 +;CHECK-VP: Found an estimated cost of 7 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 +;CHECK-VP: Found an estimated cost of 14 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 +;CHECK-VP: Found an estimated cost of 21 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 +;CHECK-VP: Found an estimated cost of 35 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 entry: br label %for.body Index: llvm/test/Analysis/CostModel/X86/interleaved-load-float.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/interleaved-load-float.ll +++ llvm/test/Analysis/CostModel/X86/interleaved-load-float.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s +; 
RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" @@ -10,7 +11,8 @@ define void @stride8(float %k, i32 %width_) { entry: -; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float +; CHECK-CM: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float +; CHECK-VP: Found an estimated cost of 48 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 8 %cmp72 = icmp sgt i32 %width_, 0 br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup @@ -98,7 +100,8 @@ define void @stride3(float %k, i32 %width_) { entry: -; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float +; CHECK-CM: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float +; CHECK-VP: Found an estimated cost of 20 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 %cmp27 = icmp sgt i32 %width_, 0 br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup Index: llvm/test/Analysis/CostModel/X86/interleaved-load-i8.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/interleaved-load-i8.ll +++ llvm/test/Analysis/CostModel/X86/interleaved-load-i8.ll @@ -1,17 +1,24 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=false --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=true --debug-only=loop-vectorize -vectorizer-maximize-bandwidth 
< %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse nounwind readonly uwtable define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) { -;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 13 for VF 16 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 16 for VF 32 For instruction: %0 = load i8 +;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load +;CHECK-VP: LV: Found an estimated cost of 11 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 10 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 13 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3 entry: %cmp13 = icmp sgt i32 %Nels, 0 br i1 %cmp13, label %for.body.preheader, label %for.end @@ -50,12 +57,18 @@ ; Function Attrs: 
norecurse nounwind readonly uwtable define i32 @doit_stride4(i8* nocapture readonly %Ptr, i32 %Nels) local_unnamed_addr { -;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 21 for VF 8 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 41 for VF 16 For instruction: %0 = load i8 -;CHECK: LV: Found an estimated cost of 84 for VF 32 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 13 for VF 2 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 21 for VF 8 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 41 for VF 16 For instruction: %0 = load i8 +;CHECK-CM: LV: Found an estimated cost of 84 for VF 32 For instruction: %0 = load i8 +;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %0 = load +;CHECK-VP: LV: Found an estimated cost of 13 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 5 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 21 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 41 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 84 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4 entry: %cmp59 = icmp sgt i32 %Nels, 0 br i1 %cmp59, label %for.body.preheader, label %for.end Index: llvm/test/Analysis/CostModel/X86/interleaved-load-store-double.ll =================================================================== --- 
llvm/test/Analysis/CostModel/X86/interleaved-load-store-double.ll +++ llvm/test/Analysis/CostModel/X86/interleaved-load-store-double.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake -cost-using-vplan=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" @@ -10,8 +11,10 @@ define void @stride2double(double %k, i32 %width_) { entry: -; CHECK: Found an estimated cost of 8 for VF 4 For instruction: %0 = load double -; CHECK: Found an estimated cost of 8 for VF 4 For instruction: store double +; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction: %0 = load double +; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction: store double +; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %0 +; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 %cmp27 = icmp sgt i32 %width_, 0 br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup Index: llvm/test/Analysis/CostModel/X86/interleaved-load-store-i64.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/interleaved-load-store-i64.ll +++ llvm/test/Analysis/CostModel/X86/interleaved-load-store-i64.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=core-avx2 %s 2>&1 | FileCheck %s +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=core-avx2 -cost-using-vplan=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize 
-mcpu=core-avx2 -cost-using-vplan=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" @@ -10,8 +11,10 @@ define void @stride2i64(i64 %k, i32 %width_) { entry: -; CHECK: Found an estimated cost of 8 for VF 4 For instruction: %0 = load i64 -; CHECK: Found an estimated cost of 8 for VF 4 For instruction: store i64 +; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction: %0 = load i64 +; CHECK-CM: Found an estimated cost of 8 for VF 4 For instruction: store i64 +; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %0 +; CHECK-VP: Found an estimated cost of 8 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 %cmp27 = icmp sgt i32 %width_, 0 br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup Index: llvm/test/Analysis/CostModel/X86/interleaved-store-i8.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/interleaved-store-i8.ll +++ llvm/test/Analysis/CostModel/X86/interleaved-store-i8.ll @@ -1,17 +1,24 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=false --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mcpu=core-avx2 -cost-using-vplan=true --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse nounwind uwtable define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr { -;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4 -;CHECK: LV: 
Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %conv4 -;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv4 +;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4 +;CHECK-CM: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4 +;CHECK-CM: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4 +;CHECK-CM: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4 +;CHECK-CM: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %conv4 +;CHECK-CM: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv4 +;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %conv +;CHECK-VP: LV: Found an estimated cost of 8 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 9 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 12 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 13 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 +;CHECK-VP: LV: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3 entry: %cmp14 = icmp sgt i32 %Nels, 0 br i1 %cmp14, label %for.body.lr.ph, label %for.end @@ -44,12 +51,18 @@ ; Function Attrs: norecurse nounwind uwtable define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr { -;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 
%conv7 -;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %conv7 -;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv7 +;CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7 +;CHECK-CM: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7 +;CHECK-CM: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7 +;CHECK-CM: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %conv7 +;CHECK-CM: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %conv7 +;CHECK-CM: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %conv7 +;CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE store %conv +;CHECK-VP: LV: Found an estimated cost of 13 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 10 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 11 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 12 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 +;CHECK-VP: LV: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4 entry: %cmp19 = icmp sgt i32 %Nels, 0 br i1 %cmp19, label %for.body.lr.ph, label %for.end Index: llvm/test/Analysis/CostModel/X86/strided-load-i16.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/strided-load-i16.ll +++ llvm/test/Analysis/CostModel/X86/strided-load-i16.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM +; 
RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -10,12 +11,18 @@ ; Function Attrs: nounwind uwtable define void @load_i16_stride2() { ;CHECK-LABEL: load_i16_stride2 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 32 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 entry: br label %for.body @@ -36,12 +43,18 @@ define void @load_i16_stride3() { ;CHECK-LABEL: load_i16_stride3 -;CHECK: Found an 
estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 5 for VF 32 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 5 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 entry: br label %for.body @@ -62,12 +75,18 @@ define void @load_i16_stride4() { ;CHECK-LABEL: load_i16_stride4 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 8 for VF 
32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 8 for VF 32 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 8 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 entry: br label %for.body @@ -88,12 +107,18 @@ define void @load_i16_stride5() { ;CHECK-LABEL: load_i16_stride5 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 10 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 5 for VF 16 For 
instruction: %1 = load +;CHECK-CM: Found an estimated cost of 10 for VF 32 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 5 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 10 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 entry: br label %for.body Index: llvm/test/Analysis/CostModel/X86/strided-load-i32.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/strided-load-i32.ll +++ llvm/test/Analysis/CostModel/X86/strided-load-i32.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -10,11 +11,16 @@ ; Function Attrs: nounwind uwtable define void @load_int_stride2() { ;CHECK-LABEL: load_int_stride2 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK-CM: 
Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 entry: br label %for.body @@ -35,11 +41,16 @@ define void @load_int_stride3() { ;CHECK-LABEL: load_int_stride3 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at 
%1 +;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 entry: br label %for.body @@ -60,11 +71,16 @@ define void @load_int_stride4() { ;CHECK-LABEL: load_int_stride4 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 5 for VF 16 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 5 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 entry: br label %for.body @@ -85,11 +101,16 @@ define void @load_int_stride5() { ;CHECK-LABEL: load_int_stride5 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 6 
for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 6 for VF 16 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 6 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 entry: br label %for.body Index: llvm/test/Analysis/CostModel/X86/strided-load-i64.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/strided-load-i64.ll +++ llvm/test/Analysis/CostModel/X86/strided-load-i64.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mattr=avx512f --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mattr=avx512f -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -10,10 +11,14 @@ ; Function Attrs: nounwind uwtable define void @load_i64_stride2() { ;CHECK-LABEL: load_i64_stride2 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found 
an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 entry: br label %for.body @@ -34,10 +39,14 @@ define void @load_i64_stride3() { ;CHECK-LABEL: load_i64_stride3 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 3 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 entry: br label %for.body @@ -58,10 +67,14 @@ define void @load_i64_stride4() { 
;CHECK-LABEL: load_i64_stride4 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 5 for VF 8 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 5 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 entry: br label %for.body Index: llvm/test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/strided-load-i8.ll +++ llvm/test/Analysis/CostModel/X86/strided-load-i8.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -S -mattr=avx512bw --debug-only=loop-vectorize < %s 2>&1| FileCheck %s +; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=false --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -loop-vectorize -S -mattr=avx512bw -cost-using-vplan=true --debug-only=loop-vectorize < %s 2>&1| FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -10,13 +11,20 @@ ; Function Attrs: nounwind uwtable define void @load_i8_stride2() { ;CHECK-LABEL: load_i8_stride2 -;CHECK: Found an estimated cost of 1 for VF 1 For 
instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 4 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load -;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 4 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 8 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 20 for VF 64 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 8 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 +;CHECK-VP: Found an estimated cost of 20 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 2 at %1 entry: br label %for.body @@ -37,13 +45,20 @@ define void @load_i8_stride3() { ;CHECK-LABEL: load_i8_stride3 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 
4 For instruction: %1 = load -;CHECK: Found an estimated cost of 4 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 13 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 16 for VF 32 For instruction: %1 = load -;CHECK: Found an estimated cost of 25 for VF 64 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 4 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 13 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 16 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 25 for VF 64 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 13 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 16 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 +;CHECK-VP: Found an estimated cost of 25 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 3 at %1 entry: br label %for.body @@ -64,13 +79,20 @@ define void @load_i8_stride4() { ;CHECK-LABEL: load_i8_stride4 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 4 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost 
of 8 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load -;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 4 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 59 for VF 64 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 1 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 20 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 +;CHECK-VP: Found an estimated cost of 59 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 4 at %1 entry: br label %for.body @@ -91,13 +113,20 @@ define void @load_i8_stride5() { ;CHECK-LABEL: load_i8_stride5 -;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load -;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load -;CHECK: Found an estimated cost of 4 for VF 4 For instruction: %1 = load -;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load -;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load -;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load -;CHECK: Found an 
estimated cost of 78 for VF 64 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 4 for VF 4 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 8 for VF 8 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 20 for VF 16 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 39 for VF 32 For instruction: %1 = load +;CHECK-CM: Found an estimated cost of 78 for VF 64 For instruction: %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %1 = load +;CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 20 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 39 for VF 32 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 +;CHECK-VP: Found an estimated cost of 78 for VF 64 For recipe: "INTERLEAVE-GROUP with factor 5 at %1 entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST +; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize -cost-using-vplan=false 2>&1 | FileCheck %s --check-prefix=COST +; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize -cost-using-vplan=true 2>&1 | 
FileCheck %s --check-prefix=COST-VPLAN ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -13,6 +14,8 @@ ; ; COST-LABEL: predicated_udiv_scalarized_operand ; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 +; COST-VPLAN-LABEL: predicated_udiv_scalarized_operand +; COST-VPLAN: LV: Found an estimated cost of 4 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp2, %tmp3 (S->V) ; ; CHECK-LABEL: @predicated_udiv_scalarized_operand( ; CHECK: vector.body: Index: llvm/test/Transforms/LoopVectorize/AArch64/costmodel.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/costmodel.ll @@ -0,0 +1,217 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -cost-using-vplan=false -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -loop-vectorize -cost-using-vplan=true -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; This is a series of test cases that show potential differences between the +; old cost model and the vplan version. The scores are not necessarily precise, +; but just to show differences not tested elsewhere. 
+ +; CHECK-LABEL: predicated_store +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, i32* %CF_marker_x, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %cmp1 = icmp eq i32 %0, %fpt +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %cmp1, label %if.then, label %for.inc +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx3 = getelementptr inbounds double, double* %y_data, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction: store double 0.000000e+00, double* %arrayidx3, align 8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: br label %for.inc +; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +; CHECK-CM: LV: Scalar loop costs: 6. 
+; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds i32, i32* %CF_marker_x, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp1 = icmp eq i32 %0, %fpt +; CHECK-CM: LV: Found an estimated cost of 3 for VF 2 For instruction: br i1 %cmp1, label %if.then, label %for.inc +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx3 = getelementptr inbounds double, double* %y_data, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: store double 0.000000e+00, double* %arrayidx3, align 8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +; CHECK-CM: LV: Vector loop of width 2 costs: 5. +; CHECK-CM: LV: Selecting VF: 2. 
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx = getelementptr %CF_marker_x, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE %0 = load %arrayidx +; CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %cmp1 = icmp %0, %fpt +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx3 = getelementptr %y_data, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "BRANCH-ON-MASK ir<%cmp1>\l" +; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE store 0.000000e+00, %arrayidx3 +; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For loop induction check (add + icmp) +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For loop backedge cost (br) +; CHECK-VP: LV: Vector loop of width 1 costs: 6. +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "CLONE %arrayidx = getelementptr %CF_marker_x, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN load ir<%arrayidx> +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN\l"" %cmp1 = icmp %0, %fpt +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %arrayidx3 = getelementptr %y_data, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 3 for VF 2 For recipe: "BRANCH-ON-MASK ir<%cmp1>\l" +; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %arrayidx3 +; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For loop induction check (add + icmp) +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For loop backedge cost (br) +; CHECK-VP: LV: Vector loop of width 2 costs: 4. 
+; CHECK-VP: LV: Selecting VF: 2. +define i32 @predicated_store(i32* nocapture readonly %CF_marker_x, double* nocapture %y_data, i32 %num_rows, i32 %fpt) { +entry: + %cmp8 = icmp sgt i32 %num_rows, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %num_rows to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret i32 undef + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, i32* %CF_marker_x, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp eq i32 %0, %fpt + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx3 = getelementptr inbounds double, double* %y_data, i64 %indvars.iv + store double 0.000000e+00, double* %arrayidx3, align 8 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: vif +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ] +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction: %1 = load float, float* %arrayidx.us, align 4 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %cmp5.us = fcmp ogt float %1, 0.000000e+00 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %cmp5.us, label %if.then.us, label %for.inc.us +; CHECK-CM: LV: Found an 
estimated cost of 0 for VF 1 For instruction: %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 2 for VF 1 For instruction: store float %1, float* %arrayidx9.us, align 4 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: br label %for.inc.us +; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-CM: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us +; CHECK-CM: LV: Scalar loop costs: 6. +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ] +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %1 = load float, float* %arrayidx.us, align 4 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp5.us = fcmp ogt float %1, 0.000000e+00 +; CHECK-CM: LV: Found an estimated cost of 3 for VF 2 For instruction: br i1 %cmp5.us, label %if.then.us, label %for.inc.us +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 3 for VF 2 For instruction: store float %1, float* %arrayidx9.us, align 4 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc.us +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: 
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us +; CHECK-CM: LV: Vector loop of width 2 costs: 5. +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ] +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction: %1 = load float, float* %arrayidx.us, align 4 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction: %cmp5.us = fcmp ogt float %1, 0.000000e+00 +; CHECK-CM: LV: Found an estimated cost of 9 for VF 4 For instruction: br i1 %cmp5.us, label %if.then.us, label %for.inc.us +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv +; CHECK-CM: LV: Found an estimated cost of 8 for VF 4 For instruction: store float %1, float* %arrayidx9.us, align 4 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: br label %for.inc.us +; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us +; CHECK-CM: LV: Vector loop of width 4 costs: 6. +; CHECK-CM: LV: Selecting VF: 2. 
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx.us = getelementptr %b, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE %1 = load %arrayidx.us +; CHECK-VP: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %cmp5.us = fcmp %1, 0.000000e+00 +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %arrayidx9.us = getelementptr %a, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For recipe: "BRANCH-ON-MASK ir<%cmp5.us>\l" +; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For recipe: "CLONE store %1, %arrayidx9.us +; CHECK-VP: LV: Found an estimated cost of 2 for VF 1 For loop induction check (add + icmp) +; CHECK-VP: LV: Found an estimated cost of 0 for VF 1 For loop backedge cost (br) +; CHECK-VP: LV: Vector loop of width 1 costs: 6. +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "CLONE %arrayidx.us = getelementptr %b, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN load ir<%arrayidx.us> +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN\l"" %cmp5.us = fcmp %1, 0.000000e+00 +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %arrayidx9.us = getelementptr %a, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 3 for VF 2 For recipe: "BRANCH-ON-MASK ir<%cmp5.us>\l" +; CHECK-VP: LV: Found an estimated cost of 3 for VF 2 For recipe: "REPLICATE store %1, %arrayidx9.us +; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For loop induction check (add + icmp) +; CHECK-VP: LV: Found an estimated cost of 0 for VF 2 For loop backedge cost (br) +; CHECK-VP: LV: Vector loop of width 2 costs: 5. 
+; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For recipe: "CLONE %arrayidx.us = getelementptr %b, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 1 for VF 4 For recipe: "WIDEN load ir<%arrayidx.us> +; CHECK-VP: LV: Found an estimated cost of 1 for VF 4 For recipe: "WIDEN\l"" %cmp5.us = fcmp %1, 0.000000e+00 +; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %arrayidx9.us = getelementptr %a, %indvars.iv +; CHECK-VP: LV: Found an estimated cost of 9 for VF 4 For recipe: "BRANCH-ON-MASK ir<%cmp5.us>\l" +; CHECK-VP: LV: Found an estimated cost of 8 for VF 4 For recipe: "REPLICATE store %1, %arrayidx9.us +; CHECK-VP: LV: Found an estimated cost of 2 for VF 4 For loop induction check (add + icmp) +; CHECK-VP: LV: Found an estimated cost of 0 for VF 4 For loop backedge cost (br) +; CHECK-VP: LV: Vector loop of width 4 costs: 5. +; CHECK-VP: LV: Selecting VF: 2. 
+define i32 @vif(i32 %ntimes, i32 %LEN, float* %a, float* %b, float* %c, float* %d, float* %e, i32 %aa, i32 %bb, i32 %cc) { +entry: + %cmp27 = icmp sgt i32 %ntimes, 0 + br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %cmp225 = icmp sgt i32 %LEN, 0 + br i1 %cmp225, label %for.cond1.preheader.us.preheader, label %for.cond1.preheader.preheader + +for.cond1.preheader.preheader: ; preds = %for.cond1.preheader.lr.ph + br label %for.cond1.preheader + +for.cond1.preheader.us.preheader: ; preds = %for.cond1.preheader.lr.ph + %wide.trip.count = zext i32 %LEN to i64 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us + %nl.028.us = phi i32 [ %inc12.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + br label %for.body4.us + +for.body4.us: ; preds = %for.cond1.preheader.us, %for.inc.us + %indvars.iv = phi i64 [ 0, %for.cond1.preheader.us ], [ %indvars.iv.next, %for.inc.us ] + %arrayidx.us = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx.us, align 4 + %cmp5.us = fcmp ogt float %0, 0.000000e+00 + br i1 %cmp5.us, label %if.then.us, label %for.inc.us + +if.then.us: ; preds = %for.body4.us + %arrayidx9.us = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %0, float* %arrayidx9.us, align 4 + br label %for.inc.us + +for.inc.us: ; preds = %if.then.us, %for.body4.us + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.inc.us + %inc12.us = add nuw nsw i32 %nl.028.us, 1 + %exitcond30.not = icmp eq i32 %inc12.us, %ntimes + br i1 %exitcond30.not, label %for.cond.cleanup.loopexit, label 
%for.cond1.preheader.us + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader + %nl.028 = phi i32 [ %inc12, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ] + %inc12 = add nuw nsw i32 %nl.028, 1 + %exitcond31.not = icmp eq i32 %inc12, %ntimes + br i1 %exitcond31.not, label %for.cond.cleanup.loopexit33, label %for.cond1.preheader + +for.cond.cleanup.loopexit: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit33: ; preds = %for.cond1.preheader + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit33, %for.cond.cleanup.loopexit, %entry + ret i32 0 +} Index: llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -1,6 +1,7 @@ ; REQUIRES: asserts -; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -cost-using-vplan=false %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM-OLD %s +; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -cost-using-vplan=true %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM-VPLAN %s ; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s ; Test case from PR41294. @@ -8,9 +9,12 @@ ; Check scalar cost for extractvalue. The constant and loop invariant operands are free, ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2. -; CM: LV: Scalar loop costs: 7. 
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0 -; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1 +; CM-OLD: LV: Scalar loop costs: 7. +; CM-OLD: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0 +; CM-OLD-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1 +; CM-VPLAN: LV: Vector loop of width 1 costs: 7. +; CM-VPLAN: LV: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %a = extractvalue %sv +; CM-VPLAN-NEXT: LV: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %b = extractvalue %sv ; Check that the extractvalue operands are actually free in vector code. @@ -57,9 +61,12 @@ ; Similar to the test case above, but checks getVectorCallCost as well. declare float @pow(float, float) readnone nounwind -; CM: LV: Scalar loop costs: 16. -; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0 -; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1 +; CM-OLD: LV: Scalar loop costs: 16. +; CM-OLD: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0 +; CM-OLD-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1 +; CM-VPLAN: LV: Vector loop of width 1 costs: 16. 
+; CM-VPLAN: LV: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %a = extractvalue %sv +; CM-VPLAN-NEXT: LV: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %b = extractvalue %sv ; FORCED-LABEL: define void @test_getVectorCallCost Index: llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -cost-using-vplan=false -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -cost-using-vplan=true -S --debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; This test shows extremely high interleaving cost that, probably, should be fixed. 
; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize @@ -11,8 +12,10 @@ %pair = type { i8, i8 } ; CHECK-LABEL: test -; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 -; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK-CM: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 +; CHECK-CM: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK-VP: Found an estimated cost of 20 for VF 2 For recipe: {{.*}} load +; CHECK-VP: Found an estimated cost of 0 for VF 2 For recipe: {{.*}} load ; CHECK: vector.body ; CHECK: load i8 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body Index: llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -1,7 +1,11 @@ -; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 -; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 -; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 -; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=false 
-debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -22,6 +26,12 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +; VP_8-LABEL: Checking a loop in "i8_factor_2" +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i8_factor_2" +; VP_16: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 @@ -58,6 +68,15 @@ ; VF_16-NEXT:
Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +; VP_4-LABEL: Checking a loop in "i16_factor_2" +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i16_factor_2" +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i16_factor_2" +; VP_16: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -99,6 +118,18 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +; VP_2-LABEL: Checking a loop in "i32_factor_2" +; VP_2: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_2: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_4-LABEL: Checking a loop in "i32_factor_2" +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +;
VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i32_factor_2" +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i32_factor_2" +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 @@ -140,6 +171,18 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 +; VP_2-LABEL: Checking a loop in "i64_factor_2" +; VP_2: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_2: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_4-LABEL: Checking a loop in "i64_factor_2" +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i64_factor_2" +; VP_8: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i64_factor_2" +; VP_16: Found an estimated cost of 16 for VF
16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 @@ -172,6 +215,10 @@ ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VP_2-LABEL: Checking a loop in "i64_factor_8" +; VP_2: Found an estimated cost of 6 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 8 at %tmp2, ir<%tmp0> +; VP_2: Found an estimated cost of 7 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 7 for VF 2 For recipe: "REPLICATE store 0, %tmp1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2 Index: llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll @@ -1,12 +1,14 @@ ; REQUIRES: asserts -; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -cost-using-vplan=false -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -cost-using-vplan=true -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" ; CHECK-LABEL:
all_scalar ; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 -; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK-VP-NOT: LV: Found an estimated cost of {{.*}} for VF 2 For recipe: {{.*}} add ; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions ; define void @all_scalar(i64* %a, i64 %n) { @@ -27,7 +29,8 @@ ; CHECK-LABEL: PR33193 ; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64 -; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64 +; CHECK-VP-NOT: LV: Found an estimated cost of {{.*}} for VF 8 For recipe: {{.*}} zext ; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions %struct.a = type { i32, i8 } define void @PR33193(%struct.a* %a, i64 %n) { Index: llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -force-vector-width=2 -loop-vectorize -cost-using-vplan=false -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -force-vector-width=2 -loop-vectorize -cost-using-vplan=true -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -18,8 +19,9
@@ ; Cost of udiv: ; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 ; -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp2, %tmp3 (S->V) ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { entry: @@ -59,8 +61,9 @@ ; Cost of store: ; (store(4) + extractelement(3)) / 2 = 3 ; -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK-CM: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK-VP: Found an estimated cost of 3 for VF 2 For recipe: "REPLICATE store %tmp2, %tmp0 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -100,8 +103,10 @@ ; ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x +; CHECK-CM: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %tmp3 = add %tmp2, %x +; CHECK-VP: Found an estimated cost of 4 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp2, %tmp3 ; define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) 
{ entry: @@ -145,8 +150,10 @@ ; ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x +; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %tmp2 = add %tmp1, %x +; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE store %tmp2, %tmp0 ; define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -197,11 +204,16 @@ ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x ; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 -; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 +; CHECK-CM: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x +; CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK-CM: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x +; CHECK-CM: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, 
i32* %tmp0, align 4 +; CHECK-VP: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN\l"" %tmp2 = add %tmp1, %x +; CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %tmp3 = sdiv %tmp1, %tmp2 +; CHECK-VP: Found an estimated cost of 5 for VF 2 For recipe: "REPLICATE %tmp4 = udiv %tmp3, %tmp2 +; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %tmp5 = sub %tmp4, %x +; CHECK-VP: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE store %tmp5, %tmp0 ; define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { entry: Index: llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -1,7 +1,11 @@ -; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 -; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 -; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 -; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=false -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=false 
-debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; RUN: opt -loop-vectorize -force-vector-width=2 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -cost-using-vplan=true -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -22,6 +26,12 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +; VP_8-LABEL: Checking a loop in "i8_factor_2" +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i8_factor_2" +; VP_16: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 2 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 @@ -58,6 +68,15 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction:
store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +; VP_4-LABEL: Checking a loop in "i16_factor_2" +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i16_factor_2" +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 2 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i16_factor_2" +; VP_16: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -99,6 +118,18 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +; VP_2-LABEL: Checking a loop in "i32_factor_2" +; VP_2: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_2: Found an estimated cost of 2 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_4-LABEL: Checking a loop in "i32_factor_2" +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 2 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i32_factor_2" +; VP_8: Found
an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i32_factor_2" +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 @@ -130,6 +161,12 @@ ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 +; VP_4-LABEL: Checking a loop in "half_factor_2" +; VP_4: Found an estimated cost of 40 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 32 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_8-LABEL: Checking a loop in "half_factor_2" +; VP_8: Found an estimated cost of 80 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 64 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0 Index: llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -1,7 +1,11 @@ -; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
--check-prefix=VF_2 -; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 -; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 -; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -cost-using-vplan=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VP_16 ; REQUIRES: asserts target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -34,6 +38,20 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for 
VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +; VP_2-LABEL: Checking a loop in "i8_factor_2" +; VP_2: Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-LABEL: Checking a loop in "i8_factor_2" +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i8_factor_2" +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i8_factor_2" +; VP_16: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 4 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 @@ -75,6 +93,20 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +; VP_2-LABEL: Checking a loop in "i16_factor_2" +; VP_2: Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated
cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-LABEL: Checking a loop in "i16_factor_2" +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i16_factor_2" +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i16_factor_2" +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at <badref>, ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -116,6 +148,20 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +; VP_2-LABEL: Checking a loop in "i32_factor_2" +; VP_2: Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-LABEL: Checking a loop in "i32_factor_2" +; VP_4: Found an
estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> +; VP_8-LABEL: Checking a loop in "i32_factor_2" +; VP_8: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> +; VP_16-LABEL: Checking a loop in "i32_factor_2" +; VP_16: Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 @@ -157,6 +203,26 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 576 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 +; VP_2-LABEL: Checking a loop in "i64_factor_2" +; VP_2: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 16 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-LABEL: Checking a loop in "i64_factor_2" +; VP_4: Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 48 for VF 4 For 
recipe: "REPLICATE store 0, %tmp1 +; VP_8-LABEL: Checking a loop in "i64_factor_2" +; VP_8: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 160 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-LABEL: Checking a loop in "i64_factor_2" +; VP_16: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 576 for VF 16 For recipe: "REPLICATE store 0, %tmp1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 @@ -198,6 +264,22 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load half, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2 +; VP_2-LABEL: Checking a loop in "f16_factor_2" +; VP_2: Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_2-NEXT: Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_4-LABEL: Checking a loop in "f16_factor_2" +; VP_4: Found an estimated cost of 72 for VF 4 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE 
%tmp3 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_4-NEXT: Found an estimated cost of 40 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_8-LABEL: Checking a loop in "f16_factor_2" +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 4 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> +; VP_16-LABEL: Checking a loop in "f16_factor_2" +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 8 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f16.2, %f16.2* %data, i64 %i, i32 0 @@ -239,6 +321,20 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load float, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4 +; VP_2-LABEL: Checking a loop in "f32_factor_2" +; VP_2: Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_2-NEXT: Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_4-LABEL: Checking a loop in "f32_factor_2" +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_4: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> +; VP_8-LABEL: Checking a loop in "f32_factor_2" 
+; VP_8: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_8: Found an estimated cost of 8 for VF 8 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> +; VP_16-LABEL: Checking a loop in "f32_factor_2" +; VP_16: Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp2, ir<%tmp0> +; VP_16: Found an estimated cost of 16 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 2 at , ir<%tmp1> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f32.2, %f32.2* %data, i64 %i, i32 0 @@ -280,6 +376,26 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, double* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 544 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8 +; VP_2-LABEL: Checking a loop in "f64_factor_2" +; VP_2: Found an estimated cost of 20 for VF 2 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_2-NEXT: Found an estimated cost of 12 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_4-LABEL: Checking a loop in "f64_factor_2" +; VP_4: Found an estimated cost of 72 for VF 4 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_4-NEXT: Found an estimated cost of 40 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_8-LABEL: Checking a loop in "f64_factor_2" +; VP_8: Found an estimated cost of 272 for VF 8 For recipe: "REPLICATE %tmp2 = 
load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_8-NEXT: Found an estimated cost of 144 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_16-LABEL: Checking a loop in "f64_factor_2" +; VP_16: Found an estimated cost of 1056 for VF 16 For recipe: "REPLICATE %tmp2 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_16-NEXT: Found an estimated cost of 544 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 0 @@ -333,6 +449,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1 +; VP_2-LABEL: Checking a loop in "i8_factor_3" +; VP_2: Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-LABEL: Checking a loop in "i8_factor_3" +; VP_4: Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 
For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-NEXT: Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0, %tmp2 +; VP_8-LABEL: Checking a loop in "i8_factor_3" +; VP_8: Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-LABEL: Checking a loop in "i8_factor_3" +; VP_16: Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.3, %i8.3* %data, i64 %i, i32 0 @@ -385,6 +529,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_16-NEXT: Found an 
estimated cost of 816 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2 +; VP_2-LABEL: Checking a loop in "i16_factor_3" +; VP_2: Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-LABEL: Checking a loop in "i16_factor_3" +; VP_4: Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-NEXT: Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0, %tmp2 +; VP_8-LABEL: Checking a loop in "i16_factor_3" +; VP_8: Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-LABEL: Checking a loop in "i16_factor_3" +; VP_16: Found an estimated cost of 1584 for VF 16 For recipe: 
"REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.3, %i16.3* %data, i64 %i, i32 0 @@ -437,6 +609,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4 +; VP_2-LABEL: Checking a loop in "i32_factor_3" +; VP_2: Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-LABEL: Checking a loop in "i32_factor_3" +; VP_4: Found an estimated cost of 24 for VF 4 For recipe: "WIDEN load ir<%tmp0> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0> +; VP_4-NEXT: Found an estimated cost of 0 
for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0> +; VP_4-NEXT: Found an estimated cost of 24 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0> +; VP_8-LABEL: Checking a loop in "i32_factor_3" +; VP_8: Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-LABEL: Checking a loop in "i32_factor_3" +; VP_16: Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.3, %i32.3* %data, i64 %i, i32 0 @@ -489,6 +689,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 864 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8 +; VP_2-LABEL: Checking a loop in "i64_factor_3" +; VP_2: Found an estimated cost of 36 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost 
of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-LABEL: Checking a loop in "i64_factor_3" +; VP_4: Found an estimated cost of 120 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-NEXT: Found an estimated cost of 72 for VF 4 For recipe: "REPLICATE store 0, %tmp2 +; VP_8-LABEL: Checking a loop in "i64_factor_3" +; VP_8: Found an estimated cost of 432 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 240 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-LABEL: Checking a loop in "i64_factor_3" +; VP_16: Found an estimated cost of 1632 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 
for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 864 for VF 16 For recipe: "REPLICATE store 0, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 0 @@ -541,6 +769,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2 +; VP_2-LABEL: Checking a loop in "f16_factor_3" +; VP_2: Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_2-NEXT: Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_4-LABEL: Checking a loop in "f16_factor_3" +; VP_4: Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_4-NEXT: Found an estimated cost of 60 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_8-LABEL: Checking a loop in 
"f16_factor_3" +; VP_8: Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_8-NEXT: Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_16-LABEL: Checking a loop in "f16_factor_3" +; VP_16: Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_16-NEXT: Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 0 @@ -593,6 +849,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4 +; VP_2-LABEL: Checking a loop in "f32_factor_3" +; VP_2: Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 
for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_2-NEXT: Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_4-LABEL: Checking a loop in "f32_factor_3" +; VP_4: Found an estimated cost of 24 for VF 4 For recipe: "WIDEN load ir<%tmp0> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0.000000e+00> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0.000000e+00> +; VP_4-NEXT: Found an estimated cost of 24 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0.000000e+00> +; VP_8-LABEL: Checking a loop in "f32_factor_3" +; VP_8: Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_8-NEXT: Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_16-LABEL: Checking a loop in "f32_factor_3" +; VP_16: Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 
For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_16-NEXT: Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 0 @@ -645,6 +929,34 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8 +; VP_2-LABEL: Checking a loop in "f64_factor_3" +; VP_2: Found an estimated cost of 30 for VF 2 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_2-NEXT: Found an estimated cost of 18 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_4-LABEL: Checking a loop in "f64_factor_3" +; VP_4: Found an estimated cost of 108 for VF 4 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_4-NEXT: Found an estimated cost of 60 for VF 4 For recipe: 
"REPLICATE store 0.000000e+00, %tmp2 +; VP_8-LABEL: Checking a loop in "f64_factor_3" +; VP_8: Found an estimated cost of 408 for VF 8 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_8-NEXT: Found an estimated cost of 216 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_16-LABEL: Checking a loop in "f64_factor_3" +; VP_16: Found an estimated cost of 1584 for VF 16 For recipe: "REPLICATE %tmp3 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_16-NEXT: Found an estimated cost of 816 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 0 @@ -708,6 +1020,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1 ; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 +; VP_2-LABEL: Checking a loop in "i8_factor_4" +; VP_2: Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load 
%tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp3 +; VP_4-LABEL: Checking a loop in "i8_factor_4" +; VP_4: Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-NEXT: Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0, %tmp3 +; VP_8-LABEL: Checking a loop in "i8_factor_4" +; VP_8: Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For 
recipe: "REPLICATE store 0, %tmp2 +; VP_8-NEXT: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0, %tmp3 +; VP_16-LABEL: Checking a loop in "i8_factor_4" +; VP_16: Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0 @@ -771,6 +1119,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 +; VP_2-LABEL: Checking a loop in "i16_factor_4" +; VP_2: Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; 
VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp3 +; VP_4-LABEL: Checking a loop in "i16_factor_4" +; VP_4: Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-NEXT: Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0, %tmp3 +; VP_8-LABEL: Checking a loop in "i16_factor_4" +; VP_8: Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_8-NEXT: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0, %tmp3 +; VP_16-LABEL: Checking a loop in "i16_factor_4" +; VP_16: Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: 
Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0 @@ -834,6 +1218,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 +; VP_2-LABEL: Checking a loop in "i32_factor_4" +; VP_2: Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0, %tmp3 +; VP_4-LABEL: Checking a loop in "i32_factor_4" +; VP_4: Found an estimated cost of 32 for VF 4 For recipe: "WIDEN load ir<%tmp0> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load 
ir<%tmp1> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp3> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0> +; VP_4-NEXT: Found an estimated cost of 32 for VF 4 For recipe: "WIDEN store ir<%tmp3>, ir<0> +; VP_8-LABEL: Checking a loop in "i32_factor_4" +; VP_8: Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_8-NEXT: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0, %tmp3 +; VP_16-LABEL: Checking a loop in "i32_factor_4" +; VP_16: Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost 
of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0 @@ -897,6 +1317,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 1152 for VF 16 For instruction: store i64 0, i64* %tmp3, align 8 +; VP_2-LABEL: Checking a loop in "i64_factor_4" +; VP_2: Found an estimated cost of 48 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0, %tmp2 +; VP_2-NEXT: Found an estimated cost of 32 for VF 2 For recipe: "REPLICATE store 0, %tmp3 +; VP_4-LABEL: Checking a loop in "i64_factor_4" +; VP_4: Found an estimated cost of 160 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp1 +; 
VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0, %tmp2 +; VP_4-NEXT: Found an estimated cost of 96 for VF 4 For recipe: "REPLICATE store 0, %tmp3 +; VP_8-LABEL: Checking a loop in "i64_factor_4" +; VP_8: Found an estimated cost of 576 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0, %tmp2 +; VP_8-NEXT: Found an estimated cost of 320 for VF 8 For recipe: "REPLICATE store 0, %tmp3 +; VP_16-LABEL: Checking a loop in "i64_factor_4" +; VP_16: Found an estimated cost of 2176 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1152 for VF 16 For recipe: "REPLICATE store 0, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 0 @@ -960,6 +1416,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 
0xH0000, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VP_2-LABEL: Checking a loop in "f16_factor_4" +; VP_2: Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0xH0000, %tmp3 +; VP_4-LABEL: Checking a loop in "f16_factor_4" +; VP_4: Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_4-NEXT: Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0xH0000, %tmp3 +; VP_8-LABEL: Checking a loop in "f16_factor_4" +; VP_8: Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = 
load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_8-NEXT: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0xH0000, %tmp3 +; VP_16-LABEL: Checking a loop in "f16_factor_4" +; VP_16: Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0xH0000, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0 @@ -1023,6 +1515,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* 
%tmp3, align 4 +; VP_2-LABEL: Checking a loop in "f32_factor_4" +; VP_2: Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp3 +; VP_4-LABEL: Checking a loop in "f32_factor_4" +; VP_4: Found an estimated cost of 32 for VF 4 For recipe: "WIDEN load ir<%tmp0> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp1> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp2> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN load ir<%tmp3> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp0>, ir<0.000000e+00> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp1>, ir<0.000000e+00> +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "WIDEN store ir<%tmp2>, ir<0.000000e+00> +; VP_4-NEXT: Found an estimated cost of 32 for VF 4 For recipe: "WIDEN store ir<%tmp3>, ir<0.000000e+00> +; VP_8-LABEL: Checking a loop in "f32_factor_4" +; VP_8: Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found 
an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_8-NEXT: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp3 +; VP_16-LABEL: Checking a loop in "f32_factor_4" +; VP_16: Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0 @@ -1086,6 +1614,42 @@ ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store double 0.000000e+00, double* %tmp3, align 8 +; VP_2-LABEL: Checking a loop in "f64_factor_4" +; VP_2: Found an estimated cost of 40 for VF 2 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_2-NEXT: 
Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_2-NEXT: Found an estimated cost of 0 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_2-NEXT: Found an estimated cost of 24 for VF 2 For recipe: "REPLICATE store 0.000000e+00, %tmp3 +; VP_4-LABEL: Checking a loop in "f64_factor_4" +; VP_4: Found an estimated cost of 144 for VF 4 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_4-NEXT: Found an estimated cost of 0 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_4-NEXT: Found an estimated cost of 80 for VF 4 For recipe: "REPLICATE store 0.000000e+00, %tmp3 +; VP_8-LABEL: Checking a loop in "f64_factor_4" +; VP_8: Found an estimated cost of 544 for VF 8 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE 
store 0.000000e+00, %tmp0 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_8-NEXT: Found an estimated cost of 0 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_8-NEXT: Found an estimated cost of 288 for VF 8 For recipe: "REPLICATE store 0.000000e+00, %tmp3 +; VP_16-LABEL: Checking a loop in "f64_factor_4" +; VP_16: Found an estimated cost of 2112 for VF 16 For recipe: "REPLICATE %tmp4 = load %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp5 = load %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp6 = load %tmp2 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE %tmp7 = load %tmp3 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp0 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp1 +; VP_16-NEXT: Found an estimated cost of 0 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp2 +; VP_16-NEXT: Found an estimated cost of 1088 for VF 16 For recipe: "REPLICATE store 0.000000e+00, %tmp3 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 0 Index: llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll @@ -1,5 +1,6 @@ ; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s --check-prefix=CHECK -; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -cost-using-vplan=false -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -cost-using-vplan 
-disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST-VPLAN ; REQUIRES: asserts target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -10,6 +11,10 @@ ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction: %l45 = and i32 %and515, 131072 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction: %and515 = shl i32 %l41, 3 ; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction: %l45 = and i32 %and515, 131072 +; CHECK-COST-VPLAN: LV: Found an estimated cost of 0 for VF 1 For recipe: "CLONE %and515 = shl %l41, 3 +; CHECK-COST-VPLAN: LV: Found an estimated cost of 1 for VF 1 For recipe: "CLONE %l45 = and %and515, 131072 +; CHECK-COST-VPLAN: LV: Found an estimated cost of 2 for VF 4 For recipe: "WIDEN\l"" %and515 = shl %l41, 3 +; CHECK-COST-VPLAN: LV: Found an estimated cost of 2 for VF 4 For recipe: "WIDEN\l"" %l45 = and %and515, 131072 ; CHECK-NOT: vector.body define void @test([101 x i32] *%src, i32 %N) #0 { Index: llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll +++ llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll @@ -1,7 +1,10 @@ ; REQUIRES: asserts -; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \ ; RUN: -force-vector-width=2 -debug-only=loop-vectorize \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \ +; RUN: -force-vector-width=2 -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; Check costs for branches inside a vectorized loop around predicated ; 
blocks. Each such branch will be guarded with an extractelement from the @@ -32,7 +35,10 @@ for.end.loopexit: ret void -; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction: br i1 %cmp55, label %if.then, label %for.inc -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc -; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: br i1 %exitcond, label %for.end.loopexit, label %for.body +; CHECK-CM: LV: Found an estimated cost of 7 for VF 2 For instruction: br i1 %cmp55, label %if.then, label %for.inc +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: br i1 %exitcond, label %for.end.loopexit, label %for.body +; CHECK-VP: LV: Found an estimated cost of 7 for VF 2 For recipe: "BRANCH-ON-MASK ir<%cmp55> +; CHECK-VP-NOT: LV: Found an estimated cost of {{.*}} for VF 2 For recipe: {{.*}} br +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For loop backedge cost (br) } Index: llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll +++ llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll @@ -1,6 +1,9 @@ -; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \ ; RUN: -force-vector-width=2 -debug-only=loop-vectorize \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \ +; RUN: -force-vector-width=2 -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; REQUIRES: asserts ; ; Check that a scalarized load 
does not get operands scalarization costs added. @@ -22,6 +25,8 @@ for.end: ret void -; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %mul = mul nsw i64 %iv, %s -; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %ld = load i64, i64* %bct +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: %mul = mul nsw i64 %iv, %s +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: %ld = load i64, i64* %bct +; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %mul = mul %iv, %s +; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe: "REPLICATE %ld = load %bct } Index: llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll +++ llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll @@ -1,7 +1,11 @@ -; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \ ; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ ; RUN: -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ +; RUN: -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; REQUIRES: asserts ; ; Check that a scalarized load does not get a zero cost in a vectorized @@ -24,5 +28,6 @@ for.end: ret i32 %acc_next -; CHECK: Found an estimated cost of 4 for VF 4 For instruction: %ld = load i32, i32* %gep +; CHECK-CM: Found an estimated cost of 4 for VF 4 For instruction: %ld = load i32, i32* %gep +; CHECK-VP: Found an estimated cost of 
4 for VF 4 For recipe: "REPLICATE %ld = load %gep } Index: llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll +++ llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -1,8 +1,12 @@ ; REQUIRES: asserts -; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \ ; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ ; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \ -; RUN: FileCheck %s +; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ +; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-VP ; ; Check that a scalarized load/store does not get a cost for insterts/ ; extracts, since z13 supports element load/store. 
@@ -27,7 +31,9 @@ ; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 ; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe: "REPLICATE %tmp1 = load %tmp0 +; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe: "REPLICATE store %tmp2, %tmp0 } Index: llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll +++ llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll @@ -1,7 +1,10 @@ ; REQUIRES: asserts -; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \ ; RUN: -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \ +; RUN: -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; ; Check that some cost estimations for interleave groups make sense. @@ -11,10 +14,11 @@ ; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6. 
; ; CHECK: LV: Checking a loop in "fun0" -; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction: %ld0 = load i16 -; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld1 = load i16 -; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld2 = load i16 -; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld3 = load i16 +; CHECK-CM: LV: Found an estimated cost of 6 for VF 4 For instruction: %ld0 = load i16 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld1 = load i16 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld2 = load i16 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld3 = load i16 +; CHECK-VP: LV: Found an estimated cost of 6 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 4 at %ld0 define void @fun0(i16 *%ptr, i16 *%dst) { entry: br label %for.body @@ -49,7 +53,8 @@ ; which gives a cost of 5. ; ; CHECK: LV: Checking a loop in "fun1" -; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction: %ld0 = load i8 +; CHECK-CM: LV: Found an estimated cost of 5 for VF 16 For instruction: %ld0 = load i8 +; CHECK-VP: LV: Found an estimated cost of 5 for VF 16 For recipe: "INTERLEAVE-GROUP with factor 3 at %ld0 define void @fun1(i8 *%ptr, i8 *%dst) { entry: br label %for.body @@ -75,10 +80,11 @@ ; produce the vector values, which gives a cost of 6. 
; ; CHECK: LV: Checking a loop in "fun2" -; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %ld0 = load i8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld1 = load i8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld2 = load i8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld3 = load i8 +; CHECK-CM: LV: Found an estimated cost of 6 for VF 2 For instruction: %ld0 = load i8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld1 = load i8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld2 = load i8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld3 = load i8 +; CHECK-VP: LV: Found an estimated cost of 6 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 32 at %ld0 define void @fun2(i8 *%ptr, i8 *%dst) { entry: br label %for.body @@ -115,10 +121,11 @@ ; vector register boundary. ; ; CHECK: LV: Checking a loop in "fun3" -; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction: %ld0 = load i8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld1 = load i8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld2 = load i8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld3 = load i8 +; CHECK-CM: LV: Found an estimated cost of 7 for VF 2 For instruction: %ld0 = load i8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld1 = load i8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld2 = load i8 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld3 = load i8 +; CHECK-VP: LV: Found an estimated cost of 7 for VF 2 For recipe: "INTERLEAVE-GROUP with factor 30 at %ld0 define void @fun3(i8 *%ptr, i8 *%dst) { entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll =================================================================== --- 
llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll +++ llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll @@ -1,7 +1,10 @@ ; REQUIRES: asserts -; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=false \ ; RUN: -force-vector-width=4 -debug-only=loop-vectorize,vectorutils \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -cost-using-vplan=true \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize,vectorutils \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; ; Check that the loop vectorizer performs memory interleaving with accurate ; cost estimations. @@ -27,7 +30,8 @@ ret void ; CHECK: LV: Creating an interleave group with: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK-CM: LV: Found an estimated cost of 3 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK-VP: LV: Found an estimated cost of 3 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 at %tmp1 ; (vl; vl; vperm) } @@ -59,12 +63,14 @@ ; CHECK: LV: Inserted: %tmp1 = load i32, i32* %tmp0, align 4 ; CHECK: into the interleave group with %tmp3 = load i32, i32* %tmp2, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp2, align 4 +; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp2, align 4 +; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For 
recipe: "INTERLEAVE-GROUP with factor 2 at %tmp1 ; (vl; vl; vperm, vpkg) -; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp1, i32* %tmp2, align 4 -; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp3, i32* %tmp0, align 4 +; CHECK-CM: LV: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp1, i32* %tmp2, align 4 +; CHECK-CM: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp3, i32* %tmp0, align 4 +; CHECK-VP: LV: Found an estimated cost of 4 for VF 4 For recipe: "INTERLEAVE-GROUP with factor 2 ; (vmrlf; vmrhf; vst; vst) } Index: llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll +++ llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll @@ -5,9 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -; CHECK: Found an estimated cost of 4 for VF 1 For instruction: %neg = fneg float %{{.*}} -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %neg = fneg float %{{.*}} -; CHECK: Found an estimated cost of 4 for VF 4 For instruction: %neg = fneg float %{{.*}} +; CHECK: Found an estimated cost of 4 for VF 1 For {{.*}} %neg = fneg +; CHECK: Found an estimated cost of 4 for VF 2 For {{.*}} %neg = fneg +; CHECK: Found an estimated cost of 4 for VF 4 For {{.*}} %neg = fneg define void @fneg_cost(float* %a, i64 %n) { entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll +++ llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll @@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.8.0" -; 
CHECK: cost of 4 for VF 8 For instruction: %conv = fptosi float %tmp to i8 +; CHECK: cost of 4 for VF 8 For {{.*}} %conv = fptosi define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind { entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll +++ llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll @@ -32,38 +32,38 @@ %conv3 = sext i8 %1 to i32 ; sources of the mul is sext\sext from i8 ; use pmullw\sext seq. -; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 3 for VF 4 {{.*}} mul %mul = mul nsw i32 %conv3, %conv ; sources of the mul is zext\sext from i8 ; use pmulhw\pmullw\pshuf -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %conv4 = zext i8 %1 to i32 %mul2 = mul nsw i32 %conv4, %conv %sum0 = add i32 %mul, %mul2 ; sources of the mul is zext\zext from i8 ; use pmullw\zext -; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 3 for VF 4 {{.*}} mul %conv5 = zext i8 %0 to i32 %mul3 = mul nsw i32 %conv5, %conv4 %sum1 = add i32 %sum0, %mul3 ; sources of the mul is sext\-120 ; use pmullw\sext -; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 3 for VF 4 {{.*}} mul %mul4 = mul nsw i32 -120, %conv3 %sum2 = add i32 %sum1, %mul4 ; sources of the mul is sext\250 ; use pmulhw\pmullw\pshuf -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %mul5 = mul nsw i32 250, %conv3 %sum3 = add i32 %sum2, %mul5 ; sources of the mul is zext\-120 ; use pmulhw\pmullw\pshuf -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %mul6 = mul nsw i32 -120, %conv4 %sum4 = add i32 %sum3, %mul6 ; sources of the mul is zext\250 ; use pmullw\zext -; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 3 for VF 4 {{.*}} mul %mul7 = mul nsw i32 250, %conv4 
%sum5 = add i32 %sum4, %mul7 %add = add i32 %acc.013, 5 @@ -101,38 +101,38 @@ %conv3 = sext i16 %1 to i32 ; sources of the mul is sext\sext from i16 ; use pmulhw\pmullw\pshuf seq. -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %mul = mul nsw i32 %conv3, %conv ; sources of the mul is zext\sext from i16 ; use pmulld -; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 11 for VF 4 {{.*}} mul %conv4 = zext i16 %1 to i32 %mul2 = mul nsw i32 %conv4, %conv %sum0 = add i32 %mul, %mul2 ; sources of the mul is zext\zext from i16 ; use pmulhw\pmullw\zext -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %conv5 = zext i16 %0 to i32 %mul3 = mul nsw i32 %conv5, %conv4 %sum1 = add i32 %sum0, %mul3 ; sources of the mul is sext\-32000 ; use pmulhw\pmullw\sext -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %mul4 = mul nsw i32 -32000, %conv3 %sum2 = add i32 %sum1, %mul4 ; sources of the mul is sext\64000 ; use pmulld -; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 11 for VF 4 {{.*}} mul %mul5 = mul nsw i32 64000, %conv3 %sum3 = add i32 %sum2, %mul5 ; sources of the mul is zext\-32000 ; use pmulld -; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 11 for VF 4 {{.*}} mul %mul6 = mul nsw i32 -32000, %conv4 %sum4 = add i32 %sum3, %mul6 ; sources of the mul is zext\64000 ; use pmulhw\pmullw\zext -; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 +; SLM: cost of 5 for VF 4 {{.*}} mul %mul7 = mul nsw i32 250, %conv4 %sum5 = add i32 %sum4, %mul7 %add = add i32 %acc.013, 5 Index: llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll +++ llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce 
-instcombine -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce -instcombine -cost-using-vplan=false -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -loop-vectorize -mcpu=core-axv2 -force-vector-interleave=1 -dce -instcombine -cost-using-vplan=true -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -13,36 +14,60 @@ ; ; CHECK-LABEL: reduction_i8 -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = and i32 %{{.*}}, 255 -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = trunc -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = icmp -; CHECK: LV: Found an estimated cost of 
{{[0-9]+}} for VF 1 For instruction: br -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load -; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load -; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 -; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = and i32 %{{.*}}, 255 -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = trunc -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = icmp -; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: br +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} 
to i32 +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = and i32 %{{.*}}, 255 +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = trunc +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = icmp +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load +; CHECK-CM-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load +; CHECK-CM-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 +; CHECK-CM-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = and i32 %{{.*}}, 255 +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for 
VF 2 For instruction: %{{.*}} = add +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = trunc +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = icmp +; CHECK-CM: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: br +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "WIDEN-INDUCTION %{{.*}} = phi +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "WIDEN-PHI %{{.*}} = phi +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = getelementptr +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = load +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = zext +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = getelementptr +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = load +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = zext +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = and +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = add +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For recipe: "CLONE %{{.*}} = add +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For loop induction check (add + icmp) +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN-INDUCTION %{{.*}} = phi +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN-PHI %{{.*}} = phi +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: 
"CLONE %{{.*}} = getelementptr +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN load +; CHECK-VP-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN\l"" %{{.*}} = zext +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "CLONE %{{.*}} = getelementptr +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN load +; CHECK-VP-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN\l"" %{{.*}} = zext +; CHECK-VP-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN\l"" %{{.*}} = and +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN\l"" %{{.*}} = add +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For recipe: "WIDEN\l"" %{{.*}} = add +; CHECK-VP: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For loop induction check (add + icmp) ; define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { entry: Index: llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll +++ llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -mtriple x86_64 -debug -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple x86_64 -cost-using-vplan=false -debug -disable-output 2>&1 | FileCheck %s ; REQUIRES: asserts ; Check that cost model is not executed twice for VF=2 when vectorization is Index: llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll +++ llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll @@ -1,13 +1,17 @@ -; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 
-mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -cost-using-vplan=false -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -cost-using-vplan=true -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -; CHECK: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double -; CHECK: cost of 5 for VF 2 For instruction: %conv = uitofp i64 %tmp to double -; CHECK: cost of 6 for VF 4 For instruction: %conv = uitofp i64 %tmp to double +; CHECK-CM: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double +; CHECK-CM: cost of 5 for VF 2 For instruction: %conv = uitofp i64 %tmp to double +; CHECK-CM: cost of 6 for VF 4 For instruction: %conv = uitofp i64 %tmp to double +; CHECK-VP: cost of 4 for VF 1 For recipe: "CLONE %conv = uitofp %tmp +; CHECK-VP: cost of 5 for VF 2 For recipe: "WIDEN\l"" %conv = uitofp %tmp +; CHECK-VP: cost of 6 for VF 4 For recipe: "WIDEN\l"" %conv = uitofp %tmp define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind { entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll +++ llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll @@ -1,8 +1,10 @@ -; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize 
-cost-using-vplan=false -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -cost-using-vplan=true -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP ; REQUIRES: asserts ; CHECK: "foo" -; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k +; CHECK-CM: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k +; CHECK-VP: LV: Found an estimated cost of 1 for VF 4 For recipe: "WIDEN\l"" %shift = ashr %val, %k define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 { entry: br label %body Index: llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll +++ llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll @@ -23,7 +23,7 @@ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv ; A scalar select has a cost of 1 on core2 -; CHECK: cost of 1 for VF 2 {{.*}} select i1 %cond, i32 %6, i32 0 +; CHECK: cost of 1 for VF 2 {{.*}} select %sel = select i1 %cond, i32 %6, i32 zeroinitializer store i32 %sel, i32* %7, align 4 @@ -51,7 +51,7 @@ %8 = icmp ult i64 %indvars.iv, 8 ; A vector select has a cost of 1 on core2 -; CHECK: cost of 1 for VF 2 {{.*}} select i1 %8, i32 %6, i32 0 +; CHECK: cost of 1 for VF 2 {{.*}} select %sel = select i1 %8, i32 %6, i32 zeroinitializer store i32 %sel, i32* %7, align 4 Index: llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -12,6 +12,7 @@ ; uniform after vectorization. 
; ; CHECK: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i +; CHECK-LABEL: @consecutive_ptr_forward( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK-NOT: getelementptr @@ -44,6 +45,7 @@ ; uniform after vectorization. ; ; CHECK: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i +; CHECK-LABEL: @consecutive_ptr_reverse( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %offset.idx = sub i64 %n, %index @@ -82,6 +84,7 @@ ; ; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0 ; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 +; CHECK-LABEL: @interleaved_access_forward( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %[[I1:.+]] = or i64 %index, 1 @@ -99,6 +102,7 @@ ; ; INTER: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0 ; INTER: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 +; INTER-LABEL: @interleaved_access_forward( ; INTER: vector.body ; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; INTER-NOT: getelementptr @@ -139,6 +143,7 @@ ; recognized as uniform, and it should not be uniform after vectorization. 
; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0 ; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 +; CHECK-LABEL: @interleaved_access_reverse( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %offset.idx = sub i64 %n, %index @@ -157,6 +162,7 @@ ; ; INTER: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0 ; INTER: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 +; INTER-LABEL: @interleaved_access_reverse( ; INTER: vector.body ; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; INTER: %offset.idx = sub i64 %n, %index @@ -198,6 +204,7 @@ ; non-uniform. ; ; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0 +; INTER-LABEL: @predicated_store( ; INTER: vector.body ; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ] ; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0 @@ -242,6 +249,7 @@ ; because the stored type may required padding. ; ; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i +; CHECK-LABEL: @irregular_type( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %[[I1:.+]] = or i64 %index, 1 @@ -276,6 +284,7 @@ ; uniform after vectorization. ; ; CHECK: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ] +; CHECK-LABEL: @pointer_iv_uniform( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK-NOT: getelementptr @@ -308,6 +317,7 @@ ; due to scalarization of the stores. 
; ; INTER-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ] +; INTER-LABEL: @pointer_iv_non_uniform_0( ; INTER: vector.body ; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; INTER: %[[I0:.+]] = shl i64 %index, 2 @@ -358,6 +368,7 @@ ; induction variable is used by a store that will be scalarized. ; ; CHECK-NOT: LV: Found uniform instruction: %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry] +; CHECK-LABEL: @pointer_iv_non_uniform_1( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %next.gep = getelementptr x86_fp80, x86_fp80* %a, i64 %index @@ -396,6 +407,7 @@ ; ; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ] ; CHECK: LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ] +; CHECK-LABEL: @pointer_iv_mixed( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK: %next.gep = getelementptr i32, i32* %a, i64 %index @@ -453,6 +465,7 @@ ; INTER-NEXT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i ; INTER-NEXT: LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; INTER-NEXT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1 +; INTER-LABEL: @bitcast_pointer_operand( ; INTER: vector.body: ; INTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; INTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]] Index: llvm/test/Transforms/LoopVectorize/loop-scalars.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -5,6 +5,7 @@ ; CHECK-LABEL: vector_gep ; CHECK-NOT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i +; CHECK-LABEL: @vector_gep( 
; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -38,6 +39,7 @@ ; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i ; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK-LABEL: @scalar_store( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 @@ -75,6 +77,7 @@ ; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i ; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK-LABEL: @expansion( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 @@ -113,6 +116,7 @@ ; CHECK-NOT: LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8 ; CHECK: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1 +; CHECK-LABEL: @no_gep_or_bitcast( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] Index: llvm/test/Transforms/LoopVectorize/phi-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/phi-cost.ll +++ llvm/test/Transforms/LoopVectorize/phi-cost.ll @@ -1,11 +1,15 @@ ; REQUIRES: asserts -; RUN: opt < %s -loop-vectorize 
-force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -cost-using-vplan=false -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-CM +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -cost-using-vplan=true -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-VP target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: phi_two_incoming_values -; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ] -; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ] +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ] +; CHECK-CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ] +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN-INDUCTION %i = phi %i.next, 0 +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "BLEND %tmp5 = ir<%tmp1>/vp<%0> ir<%tmp4>/ir<%tmp3> +; CHECK-LABEL: @phi_two_incoming_values( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{.*}} @@ -43,8 +47,11 @@ } ; CHECK-LABEL: phi_three_incoming_values -; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ] -; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ] +; CHECK-CM: LV: 
Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ] +; CHECK-CM: LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ] +; CHECK-VP: LV: Found an estimated cost of 1 for VF 2 For recipe: "WIDEN-INDUCTION %i = phi %i.next, 0 +; CHECK-VP: LV: Found an estimated cost of 2 for VF 2 For recipe: "BLEND %tmp8 = ir<9>/vp<%0> ir<3>/vp<%1> ir<%tmp7>/vp<%3> +; CHECK-LABEL: @phi_three_incoming_values( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 9, i32 9>