// Review summary (preamble text ahead of the first "Index:" line; not part of any hunk).
//  * LoopVectorizationPlanner.h: declares the new member transformVPInstructionsToVPRecipies(VPlanPtr &, VFRange &).
//  * LoopVectorize.cpp, plan-execution site: builds Range = {BestVF, BestVF + 1}, passes VPlans.front() and Range to the
//    new function, and calls execute(&State) on the returned plan instead of on VPlans.front().
//  * LoopVectorize.cpp, buildVPlan: now always creates the plan, builds the hierarchical CFG and calls sinkInstructions()
//    before the outer-loop check; for inner loops it then adds Range.Start and each doubling below Range.End as VFs,
//    sets the plan name, and returns the plan.
//  * LoopVectorize.cpp, new function: takes over the old recipe-building body, but walks the blocks of the original
//    plan's top region in reverse post-order and reads VPInstructions instead of walking IR basic blocks.
//  * Two .ll tests: the "Scalarizing" CHECK lines are moved after the "Found an estimated cost" CHECK lines
//    (presumably because recipes are now built after cost estimation - TODO confirm).
// NOTE(review): template argument lists (after dyn_cast, isa, make_unique, SmallPtrSet, DenseMap, std::vector, ...)
//    appear to have been stripped from this copy of the patch; restore them from the original revision before applying.
// NOTE(review): in the visible hunks buildVPlan no longer references its NeedDef parameter - confirm this is intended.
// NOTE(review): the hunk removes the first line of the two-line interleave-group comment ("I is a member of an
//    InterleaveGroup ..."), leaving "member of the IG, do not construct any Recipe for it." without its beginning.
Index: lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -359,6 +359,9 @@ /// exclusive, possibly decreasing \p Range.End. VPlanPtr buildVPlan(VFRange &Range, const SmallPtrSetImpl &NeedDef); + + VPlanPtr transformVPInstructionsToVPRecipies(VPlanPtr &OriginalPlan, + VFRange &Range); }; } // namespace llvm Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6358,7 +6358,11 @@ // 2. Copy and widen instructions from the old loop into the new loop. assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - VPlans.front()->execute(&State); + + VFRange Range = {BestVF, BestVF + 1}; + VPlanPtr Widened = transformVPInstructionsToVPRecipies(VPlans.front(), Range); + + Widened->execute(&State); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -6850,6 +6854,15 @@ LoopVectorizationPlanner::VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range, const SmallPtrSetImpl &NeedDef) { + // Create new empty VPlan + auto Plan = llvm::make_unique(); + + // Build hierarchical CFG + VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI); + HCFGBuilder.buildHierarchicalCFG(*Plan.get()); + + sinkInstructions(Plan, Legal->getSinkAfter()); + // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. 
// Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -6857,22 +6870,32 @@ if (!OrigLoop->empty()) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - // Create new empty VPlan - auto Plan = llvm::make_unique(); - - // Build hierarchical CFG - VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI); - HCFGBuilder.buildHierarchicalCFG(*Plan.get()); return Plan; } - assert(OrigLoop->empty() && "Inner loop expected."); - EdgeMaskCache.clear(); - BlockMaskCache.clear(); - DenseMap &SinkAfter = Legal->getSinkAfter(); - DenseMap SinkAfterInverse; + std::string PlanName; + raw_string_ostream RSO(PlanName); + unsigned VF = Range.Start; + Plan->addVF(VF); + RSO << "Initial VPlan for VF={" << VF; + for (VF *= 2; VF < Range.End; VF *= 2) { + Plan->addVF(VF); + RSO << "," << VF; + } + RSO << "},UF>=1"; + RSO.flush(); + Plan->setName(PlanName); + + return Plan; +} + +// FIXME: move to LoopVectorizationPlanner.cpp, once LoopVectorizationCodeModel +// is moved to a header file. +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::transformVPInstructionsToVPRecipies( + VPlanPtr &OriginalPlan, VFRange &Range) { // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For // example, original induction update instructions can become dead because we @@ -6891,72 +6914,62 @@ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = llvm::make_unique(VPBB); - // Represent values that will have defs inside VPlan. - for (Value *V : NeedDef) - Plan->addVPValue(V); + // Create VPValues used by createEdgeMask. 
+ auto *Latch = OrigLoop->getLoopLatch(); + SmallPtrSet AddedValues; + for (BasicBlock *BB : OrigLoop->blocks()) { + if (BB == Latch) + continue; + BranchInst *Branch = dyn_cast(BB->getTerminator()); + if (Branch && Branch->isConditional() && + !AddedValues.count(Branch->getCondition())) { + Plan->addVPValue(Branch->getCondition()); + AddedValues.insert(Branch->getCondition()); + } + } - // Scan the body of the loop in a topological order to visit each basic block - // after having visited its predecessor basic blocks. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); + VPRegionBlock *TopRegion = dyn_cast(OriginalPlan->getEntry()); /* NOTE(review): dyn_cast may yield null; TopRegion is dereferenced in the next statement, before the TopRegion null test in the loop below - confirm, or use cast. */ + ReversePostOrderTraversal RPOT(TopRegion->getEntry()); + for (VPBlockBase *Base : RPOT) { + VPBasicBlock *OriginalVPBB = Base->getEntryBasicBlock(); + // Skip entry and exit nodes for now. Currently the recipes will take + // care of creating instructions in entry and exit blocks. + if (TopRegion && (OriginalVPBB == TopRegion->getEntry() || + OriginalVPBB == TopRegion->getExit())) + continue; - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - // Relevant instructions from basic block BB will be grouped into VPRecipe - // ingredients and fill a new VPBasicBlock. - unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); + auto *FirstVPBBForBB = new VPBasicBlock(OriginalVPBB->getName()); VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); + unsigned VPBBsForBB = 0; - std::vector Ingredients; + std::vector Ingredients; - // Organize the ingredients to vectorize from current basic block in the - // right order. - for (Instruction &I : BB->instructionsWithoutDebug()) { - Instruction *Instr = &I; + // Introduce each ingredient into VPlan. 
+ for (VPRecipeBase &Ingredient : *OriginalVPBB) { + VPInstruction *VPInst = dyn_cast(&Ingredient); + if (!VPInst) { + VPBB->appendRecipe(VPInst); /* NOTE(review): VPInst is null on this path (guarded by !VPInst), so a null pointer is passed to appendRecipe; presumably the address of Ingredient was intended - confirm. The assert on VPInst that follows this block can never fire because of this guard. */ + continue; + } - // First filter out irrelevant instructions, to ensure no recipes are - // built for them. - if (isa(Instr) || DeadInstructions.count(Instr)) + assert(VPInst && "Can only handle VPInstructions."); + Instruction *Instr = dyn_cast(VPInst->getUnderlyingValue()); /* NOTE(review): Instr comes from dyn_cast and is passed to isa and DeadInstructions.count without a null check - confirm the underlying value is always an Instruction here. */ + if (DeadInstructions.count(Instr) || isa(Instr)) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct + VPRecipeBase *Recipe = nullptr; + // member of the IG, do not construct any Recipe for it. const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr); if (IG && Instr != IG->getInsertPos() && Range.Start >= 2 && // Query is illegal for VF == 1 CM.getWideningDecision(Instr, Range.Start) == LoopVectorizationCostModel::CM_Interleave) { - if (SinkAfterInverse.count(Instr)) - Ingredients.push_back(SinkAfterInverse.find(Instr)->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. 
- auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; continue; } - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { - VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. 
if ((Recipe = tryToInterleaveMemory(Instr, Range))) { @@ -6996,8 +7009,7 @@ handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan); if (NextVPBB != VPBB) { VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); + VPBB->setName(VPBB->getName() + "." + Twine(VPBBsForBB++)); } } } Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -18,8 +18,8 @@ ; Cost of udiv: ; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 ; -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { entry: @@ -59,8 +59,8 @@ ; Cost of store: ; (store(4) + extractelement(3)) / 2 = 3 ; -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -98,10 +98,10 @@ ; Cost of udiv: ; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 ; -; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 
%n) { entry: @@ -143,10 +143,10 @@ ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { entry: @@ -192,16 +192,16 @@ ; Cost of store: ; store(4) / 2 = 2 ; -; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x -; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x -; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 ; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4 +; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x +; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x +; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4 ; define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { entry: Index: test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll 
=================================================================== --- test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll +++ test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -24,10 +24,10 @@ for.end: ret void -; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 -; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 - ; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 ; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 }