Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,6 +58,7 @@ #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlanHCFGBuilder.h" +#include "VPlanHCFGTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -6294,12 +6295,23 @@ if (VPlanBuildStressTest) return NoVectorization; + // No codegen support for outer loop VPlans for now. + return NoVectorization; + } + + if (UserVF) { + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + CM.selectUserVectorizationFactor(UserVF); + buildVPlans(UserVF, UserVF); + LLVM_DEBUG(printPlans(dbgs())); return {UserVF, 0}; } - LLVM_DEBUG( - dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " - "VPlan-native path.\n"); + LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Loop is not supported in the " + "VPlan-native path yet.\n"); return NoVectorization; } @@ -6379,7 +6391,20 @@ // 2. Copy and widen instructions from the old loop into the new loop. assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - VPlans.front()->execute(&State); + if (EnableVPlanNativePath) { + VPlanHCFGTransforms::sinkInstructions(VPlans.front(), + Legal->getSinkAfter()); + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder); + SmallPtrSet DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + VPlanPtr Widened = VPlanHCFGTransforms::VPInstructionsToVPRecipies( + OrigLoop, VPlans.front(), Legal->getInductionVars(), DeadInstructions); + + Widened->execute(&State); + } else + VPlans.front()->execute(&State); // 3. 
Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7045,11 +7070,9 @@ LoopVectorizationPlanner::VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { - // Outer loop handling: They may require CFG and instruction level - // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in - // the vectorization pipeline. - assert(!OrigLoop->empty()); + // the vectorization pipeline so we can apply CFG and instruction level + // transformations. assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan @@ -7059,6 +7082,19 @@ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI); HCFGBuilder.buildHierarchicalCFG(*Plan.get()); + std::string PlanName; + raw_string_ostream RSO(PlanName); + unsigned VF = Range.Start; + Plan->addVF(VF); + RSO << "Initial VPlan for VF={" << VF; + for (VF *= 2; VF < Range.End; VF *= 2) { + Plan->addVF(VF); + RSO << "," << VF; + } + RSO << "},UF>=1"; + RSO.flush(); + Plan->setName(PlanName); + return Plan; } @@ -7260,11 +7296,20 @@ Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); // Plan how to best vectorize, return the best VF and its cost. - LVP.planInVPlanNativePath(OptForSize, UserVF); + VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); - // Returning false. We are currently not generating vector code in the VPlan - // native path. - return false; + if (VF.Width < 2) + return false; + + LVP.setBestPlan(VF.Width, 1); + + // If we decided that it is *legal* to vectorize the loop, then do it. + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, + &CM); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + return true; } bool LoopVectorizePass::processLoop(Loop *L) { @@ -7330,7 +7375,7 @@ // even evaluating whether vectorization is profitable. 
Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->empty()) + if (EnableVPlanNativePath) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, ORE, Hints); Index: lib/Transforms/Vectorize/VPRecipeBuilder.h =================================================================== --- lib/Transforms/Vectorize/VPRecipeBuilder.h +++ lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -115,6 +115,8 @@ bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB); + void setInsertPoint(VPBasicBlock *VPBB) { Builder.setInsertPoint(VPBB); } + /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it /// is predicated. \return \p VPBB augmented with this new recipe if \p I is /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new Index: lib/Transforms/Vectorize/VPlan.h =================================================================== --- lib/Transforms/Vectorize/VPlan.h +++ lib/Transforms/Vectorize/VPlan.h @@ -1090,6 +1090,8 @@ bool hasVF(unsigned VF) { return VFs.count(VF); } + const SmallSet &getVFs() const { return VFs; } + const std::string &getName() const { return Name; } void setName(const Twine &newName) { Name = newName.str(); } Index: lib/Transforms/Vectorize/VPlanHCFGTransforms.h =================================================================== --- lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ lib/Transforms/Vectorize/VPlanHCFGTransforms.h @@ -15,22 +15,29 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H #include "LoopVectorizationPlanner.h" +#include "VPRecipeBuilder.h" #include "VPlan.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/Instruction.h" - +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" namespace llvm { class VPlanHCFGTransforms { - using VPlanPtr = std::unique_ptr; public: /// Sinks instructions in \p Plan, 
depending on their underlying values in /// \p SinkAfter. - // FIXME: Migrate to using a VPlan based mapping, once - // LoopVectorizationLegality::getSinkAfter is moved to VPlan. static void sinkInstructions(VPlanPtr &Plan, DenseMap &SinkAfter); + + /// Creates a new VPlan using VPRecipes from a VPInstruction VPlan + /// \p OriginalPlan + static VPlanPtr VPInstructionsToVPRecipies( + Loop *OrigLoop, VPlanPtr &OriginalPlan, + LoopVectorizationLegality::InductionList *Inductions, + SmallPtrSetImpl &DeadInstructions); }; } // namespace llvm Index: lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp =================================================================== --- lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -14,6 +14,7 @@ #include "VPlanHCFGTransforms.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -55,3 +56,97 @@ } } } + +// Builds and returns a new VPlan whose blocks hold widening recipes created +// from the VPInstructions of \p OriginalPlan. Instructions found in +// \p DeadInstructions are skipped; \p Inductions selects the recipe used for +// each phi. \p OrigLoop is currently unused. +VPlanPtr VPlanHCFGTransforms::VPInstructionsToVPRecipies( + Loop *OrigLoop, VPlanPtr &OriginalPlan, + LoopVectorizationLegality::InductionList *Inductions, + SmallPtrSetImpl &DeadInstructions) { + // Create a dummy pre-entry VPBasicBlock to start building VPBBs for the + // VPlan. + VPBasicBlock *PreEntryVPBB = new VPBasicBlock("Pre-Entry"); + VPBasicBlock *VPBB = PreEntryVPBB; + + VPRegionBlock *TopRegion = dyn_cast(OriginalPlan->getEntry()); + assert(TopRegion && + "Expecting the entry of the original VPlan to be a region."); + ReversePostOrderTraversal RPOT(TopRegion->getEntry()); + for (VPBlockBase *Base : RPOT) { + VPBasicBlock *OriginalVPBB = Base->getEntryBasicBlock(); + // Skip entry and exit nodes for now. Currently the recipes will take + // care of creating instructions in entry and exit blocks. 
+ if (OriginalVPBB == TopRegion->getEntry() || + OriginalVPBB == TopRegion->getExit()) + continue; + + auto *FirstVPBBForBB = new VPBasicBlock(OriginalVPBB->getName()); + VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); + VPBB = FirstVPBBForBB; + + // Introduce each ingredient into VPlan. + for (VPRecipeBase &Ingredient : *OriginalVPBB) { + VPInstruction *VPInst = dyn_cast(&Ingredient); + assert(VPInst && "Can only handle VPInstructions."); + Instruction *Inst = dyn_cast(VPInst->getUnderlyingValue()); + assert(Inst && "Expecting an underlying IR instruction."); + if (DeadInstructions.count(Inst) || isa(Inst)) + continue; + + // Create VPWidenMemoryInstructionRecipe for loads and stores. + if (isa(Inst) || isa(Inst)) { + VPBB->appendRecipe( + new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/)); + continue; + } + if (PHINode *Phi = dyn_cast(Inst)) { + InductionDescriptor II = Inductions->lookup(Phi); + if (II.getKind() == InductionDescriptor::IK_IntInduction || + II.getKind() == InductionDescriptor::IK_FpInduction) + VPBB->appendRecipe(new VPWidenIntOrFpInductionRecipe(Phi)); + else + VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); + continue; + } + + // Create VPWidenRecipe to widen this instruction. We optimize the common + // case where consecutive instructions can be represented by a single + // recipe. + if (!VPBB->empty()) { + VPWidenRecipe *LastWidenRecipe = dyn_cast(&VPBB->back()); + if (LastWidenRecipe && LastWidenRecipe->appendInstruction(Inst)) + continue; + } + + VPBB->appendRecipe(new VPWidenRecipe(Inst)); + } + } + + // Create VPlan. 
+ assert(PreEntryVPBB->empty() && "Expecting empty pre-entry block."); + // The pre-entry block is only a construction anchor and is not handed to the + // new plan, so detach it from the real entry and free it here. + VPBlockBase *Entry = PreEntryVPBB->getSingleSuccessor(); + if (Entry) + VPBlockUtils::disconnectBlocks(PreEntryVPBB, Entry); + delete PreEntryVPBB; + auto Plan = llvm::make_unique(Entry); + + std::string PlanName; + raw_string_ostream RSO(PlanName); + RSO << "VPRecipe-based VPlan for VF={"; + // Separate entries with commas without emitting a leading one. + const char *Separator = ""; + for (unsigned VF : OriginalPlan->getVFs()) { + Plan->addVF(VF); + RSO << Separator << VF; + Separator = ","; + } + RSO << "},UF>=1"; + RSO.flush(); + Plan->setName(PlanName); + + return Plan; +} Index: test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll =================================================================== --- test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-vplan-native-path -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefix=INTER target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" Index: test/Transforms/LoopVectorize/i8-induction.ll =================================================================== --- test/Transforms/LoopVectorize/i8-induction.ll +++ test/Transforms/LoopVectorize/i8-induction.ll @@ 
-1,4 +1,5 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-vplan-native-path -dce -instcombine -S target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" Index: test/Transforms/LoopVectorize/if-conversion.ll =================================================================== --- test/Transforms/LoopVectorize/if-conversion.ll +++ test/Transforms/LoopVectorize/if-conversion.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-vplan-native-path -enable-if-conversion -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" Index: test/Transforms/LoopVectorize/increment.ll =================================================================== --- test/Transforms/LoopVectorize/increment.ll +++ test/Transforms/LoopVectorize/increment.ll @@ -1,3 +1,4 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-vplan-native-path -dce -instcombine -S | FileCheck %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s +; RUN: opt < %s -enable-vplan-native-path -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s ; RUN: opt < %s 
-loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC Index: test/Transforms/LoopVectorize/induction_plus.ll =================================================================== --- test/Transforms/LoopVectorize/induction_plus.ll +++ test/Transforms/LoopVectorize/induction_plus.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; RUN: opt < %s -enable-vplan-native-path -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" Index: test/Transforms/LoopVectorize/loop-scalars.ll =================================================================== --- test/Transforms/LoopVectorize/loop-scalars.ll +++ test/Transforms/LoopVectorize/loop-scalars.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -enable-vplan-native-path -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" Index: test/Transforms/LoopVectorize/minmax_reduction.ll =================================================================== --- test/Transforms/LoopVectorize/minmax_reduction.ll +++ test/Transforms/LoopVectorize/minmax_reduction.ll @@ -1,4 +1,5 @@ -; RUN: opt -S -loop-vectorize 
-dce -instcombine -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-vplan-native-path < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"