Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -428,6 +428,11 @@ /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector; + /// Vectorize a single GetElementPtrInst based on information gathered and + /// decisions taken during planning. + void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -3961,6 +3966,75 @@ } } +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant) { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector Indices; + for (auto Index : enumerate(GEP->indices())) { + Value *User = Index.value().get(); + if (IsIndexLoopInvariant[Index.index()]) + Indices.push_back(User); + else + Indices.push_back(getOrCreateVectorValue(User, Part)); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = + GEP->isInBounds() + ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, + Indices) + : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + addMetadata(NewGEP, GEP); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast(PN); @@ -4063,76 +4137,8 @@ switch (I.getOpcode()) { case Instruction::Br: case Instruction::PHI: + case Instruction::GetElementPtr: llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast(&I); - - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = - OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getOrCreateVectorValue(U.get(), Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); - addMetadata(NewGEP, GEP); - } - } - - break; - } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -6831,7 +6837,6 @@ case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: - case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::IntToPtr: case Instruction::Load: @@ -6896,12 +6901,13 @@ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; - // If this ingredient's recipe is to be recorded, keep its recipe a singleton // to avoid having to split recipes later. bool IsSingleton = Ingredient2Recipe.count(I); - // Success: widen this instruction. We optimize the common case where + // Success: widen this instruction. + + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && LastExtensibleRecipe->appendInstruction(I)) @@ -6999,7 +7005,23 @@ return true; } - // Check if Instr is to be widened by a general VPWidenRecipe. + // Handle GEP widening. + if (GetElementPtrInst *GEP = dyn_cast(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); + VPBB->appendRecipe(Recipe); + return true; + } + + // Check if Instr is to be widened by a general VPWidenRecipe, after + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -7241,7 +7263,7 @@ SmallPtrSet DeadInstructions; VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7271,6 +7293,11 @@ State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -31,6 +31,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -587,6 +588,7 @@ VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, @@ -749,6 +751,36 @@ void print(raw_ostream &O, const Twine &Indent) const override; }; +/// A recipe for handling GEP instructions. +class VPWidenGEPRecipe : public VPRecipeBase { +private: + GetElementPtrInst *GEP; + bool IsPtrLoopInvariant; + SmallBitVector IsIndexLoopInvariant; + +public: + VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + IsIndexLoopInvariant(GEP->getNumIndices(), false) { + IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); + for (auto Index : enumerate(GEP->indices())) + IsIndexLoopInvariant[Index.index()] = + OrigLoop->isLoopInvariant(Index.value().get()); + } + ~VPWidenGEPRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + } + + /// Generate the gep nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -683,6 +683,16 @@ O << " " << VPlanIngredient(IV) << "\\l\""; } +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; +} + void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; } Index: llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h @@ -25,7 +25,7 @@ /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. static void VPInstructionsToVPRecipes( - VPlanPtr &Plan, + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl &DeadInstructions); }; Index: llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -17,7 +17,7 @@ using namespace llvm; void VPlanHCFGTransforms::VPInstructionsToVPRecipes( - VPlanPtr &Plan, + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl &DeadInstructions) { @@ -64,6 +64,8 @@ NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); + } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { + NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); } else { // If the last recipe is a VPWidenRecipe, add Inst to it instead of // creating a new recipe. Index: llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp =================================================================== --- llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -89,8 +89,8 @@ LoopVectorizationLegality::InductionList Inductions; SmallPtrSet DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes(Plan, &Inductions, - DeadInstructions); + VPlanHCFGTransforms::VPInstructionsToVPRecipes( + LI->getLoopFor(LoopHeader), Plan, &Inductions, DeadInstructions); } TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { @@ -119,8 +119,8 @@ LoopVectorizationLegality::InductionList Inductions; SmallPtrSet DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes(Plan, &Inductions, - DeadInstructions); + VPlanHCFGTransforms::VPInstructionsToVPRecipes( + LI->getLoopFor(LoopHeader), Plan, &Inductions, DeadInstructions); VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); EXPECT_NE(nullptr, Entry->getSingleSuccessor()); @@ -136,7 +136,7 @@ auto *Phi = dyn_cast(&*Iter++); EXPECT_NE(nullptr, Phi); - auto *Idx = dyn_cast(&*Iter++); + auto *Idx = dyn_cast(&*Iter++); EXPECT_NE(nullptr, Idx); auto *Load = dyn_cast(&*Iter++);