Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -428,6 +428,11 @@
   /// new unrolled loop, where UF is the unroll factor.
   using VectorParts = SmallVector<Value *, 2>;
 
+  /// Vectorize a single GetElementPtrInst based on information gathered and
+  /// decisions taken during planning.
+  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
+                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -484,7 +489,7 @@
 
   /// Vectorize Load and Store instructions, optionally masking the vector
   /// operations if \p BlockInMask is non-null.
-  void vectorizeMemoryInstruction(Instruction *Instr,
+  void vectorizeMemoryInstruction(Instruction *Instr, Value *Ptr, bool InBounds,
                                   VectorParts *BlockInMask = nullptr);
 
   /// Set the debug location in the builder using the debug location in
@@ -2343,6 +2348,7 @@
 }
 
 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+                                                     Value *Ptr, bool InBounds,
                                                      VectorParts *BlockInMask) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
@@ -2359,7 +2365,6 @@
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
-  Value *Ptr = getLoadStorePointerOperand(Instr);
   // An alignment of 0 means target abi alignment. We need to use the scalar's
   // target abi alignment in such a case.
   const DataLayout &DL = Instr->getModule()->getDataLayout();
@@ -2389,11 +2394,6 @@
   if (isMaskRequired)
     Mask = *BlockInMask;
 
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(
-          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
-    InBounds = gep->isInBounds();
-
   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
     // Calculate the pointer for the specific unroll-part.
     GetElementPtrInst *PartPtr = nullptr;
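The hunks above stop re-deriving the pointer operand and the inbounds flag inside vectorizeMemoryInstruction; the recipe now captures them once and passes them in, since the GEP feeding the memory instruction may itself be replaced by a widened clone before execution. A minimal sketch of that capture pattern, assuming getLoadStorePointerOperand from llvm/IR/Instructions.h; the MemOpInfo struct and captureMemOpInfo name are hypothetical, introduced here only for illustration:

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper mirroring what the recipe constructor (see VPlan.h
// below) does at recipe-construction time: record the memory operand and
// whether its defining GEP is inbounds, so execution never has to re-derive
// them from the (possibly already rewritten) scalar IR.
struct MemOpInfo {
  Value *Ptr = nullptr;
  bool InBounds = false;
};

static MemOpInfo captureMemOpInfo(Instruction &I) {
  MemOpInfo Info;
  // Assumes I is a load or store, as in the recipe; otherwise the helper
  // returns a null pointer operand.
  Info.Ptr = getLoadStorePointerOperand(&I);
  if (auto *GEP = dyn_cast<GetElementPtrInst>(Info.Ptr->stripPointerCasts()))
    Info.InBounds = GEP->isInBounds();
  return Info;
}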
@@ -3961,6 +3961,75 @@
   }
 }
 
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
+                                   unsigned VF, bool IsPtrLoopInvariant,
+                                   SmallBitVector &IsIndexLoopInvariant) {
+  // Construct a vector GEP by widening the operands of the scalar GEP as
+  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+  // results in a vector of pointers when at least one operand of the GEP
+  // is vector-typed. Thus, to keep the representation compact, we only use
+  // vector-typed operands for loop-varying values.
+
+  if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+    // If we are vectorizing, but the GEP has only loop-invariant operands,
+    // the GEP we build (by only using vector-typed operands for
+    // loop-varying values) would be a scalar pointer. Thus, to ensure we
+    // produce a vector of pointers, we need to either arbitrarily pick an
+    // operand to broadcast, or broadcast a clone of the original GEP.
+    // Here, we broadcast a clone of the original.
+    //
+    // TODO: If at some point we decide to scalarize instructions having
+    //       loop-invariant operands, this special case will no longer be
+    //       required. We would add the scalarization decision to
+    //       collectLoopScalars() and teach getVectorValue() to broadcast
+    //       the lane-zero scalar value.
+    auto *Clone = Builder.Insert(GEP->clone());
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+      addMetadata(EntryPart, GEP);
+    }
+  } else {
+    // If the GEP has at least one loop-varying operand, we are sure to
+    // produce a vector of pointers. But if we are only unrolling, we want
+    // to produce a scalar GEP for each unroll part. Thus, the GEP we
+    // produce with the code below will be scalar (if VF == 1) or vector
+    // (otherwise). Note that for the unroll-only case, we still maintain
+    // values in the vector mapping with initVector, as we do for other
+    // instructions.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      // The pointer operand of the new GEP. If it's loop-invariant, we
+      // won't broadcast it.
+      auto *Ptr = IsPtrLoopInvariant
+                      ? GEP->getPointerOperand()
+                      : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+      // Collect all the indices for the new GEP. If any index is
+      // loop-invariant, we won't broadcast it.
+      SmallVector<Value *, 4> Indices;
+      unsigned I = 0;
+      for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+        if (IsIndexLoopInvariant[I++])
+          Indices.push_back(U.get());
+        else
+          Indices.push_back(getOrCreateVectorValue(U.get(), Part));
+      }
+
+      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+      // but it should be a vector, otherwise.
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+                                          Indices)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+             "NewGEP is not a pointer vector");
+      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+      addMetadata(NewGEP, GEP);
+    }
+  }
+}
+
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
   PHINode *P = cast<PHINode>(PN);
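For the all-invariant case above, the generated code reduces to cloning the scalar GEP once and splatting the single resulting pointer. A minimal sketch of that path, assuming a Builder already positioned in the vector loop body; splatInvariantGEP is an illustrative name, not part of the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Clone the scalar GEP (all of its operands are loop-invariant, so the clone
// computes one pointer), then broadcast that pointer into a VF-wide vector of
// pointers, which is the shape the widened consumers expect.
static Value *splatInvariantGEP(IRBuilder<> &Builder, GetElementPtrInst *GEP,
                                unsigned VF) {
  Instruction *Clone = Builder.Insert(GEP->clone());
  return Builder.CreateVectorSplat(VF, Clone);
}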
@@ -4063,76 +4132,8 @@
   switch (I.getOpcode()) {
   case Instruction::Br:
   case Instruction::PHI:
+  case Instruction::GetElementPtr:
     llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::GetElementPtr: {
-    // Construct a vector GEP by widening the operands of the scalar GEP as
-    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-    // results in a vector of pointers when at least one operand of the GEP
-    // is vector-typed. Thus, to keep the representation compact, we only use
-    // vector-typed operands for loop-varying values.
-    auto *GEP = cast<GetElementPtrInst>(&I);
-
-    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
-      // If we are vectorizing, but the GEP has only loop-invariant operands,
-      // the GEP we build (by only using vector-typed operands for
-      // loop-varying values) would be a scalar pointer. Thus, to ensure we
-      // produce a vector of pointers, we need to either arbitrarily pick an
-      // operand to broadcast, or broadcast a clone of the original GEP.
-      // Here, we broadcast a clone of the original.
-      //
-      // TODO: If at some point we decide to scalarize instructions having
-      //       loop-invariant operands, this special case will no longer be
-      //       required. We would add the scalarization decision to
-      //       collectLoopScalars() and teach getVectorValue() to broadcast
-      //       the lane-zero scalar value.
-      auto *Clone = Builder.Insert(GEP->clone());
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
-        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
-        addMetadata(EntryPart, GEP);
-      }
-    } else {
-      // If the GEP has at least one loop-varying operand, we are sure to
-      // produce a vector of pointers. But if we are only unrolling, we want
-      // to produce a scalar GEP for each unroll part. Thus, the GEP we
-      // produce with the code below will be scalar (if VF == 1) or vector
-      // (otherwise). Note that for the unroll-only case, we still maintain
-      // values in the vector mapping with initVector, as we do for other
-      // instructions.
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        // The pointer operand of the new GEP. If it's loop-invariant, we
-        // won't broadcast it.
-        auto *Ptr =
-            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
-                ? GEP->getPointerOperand()
-                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
-
-        // Collect all the indices for the new GEP. If any index is
-        // loop-invariant, we won't broadcast it.
-        SmallVector<Value *, 4> Indices;
-        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
-          if (OrigLoop->isLoopInvariant(U.get()))
-            Indices.push_back(U.get());
-          else
-            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
-        }
-
-        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-        // but it should be a vector, otherwise.
-        auto *NewGEP =
-            GEP->isInBounds()
-                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
-                                            Indices)
-                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
-        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
-               "NewGEP is not a pointer vector");
-        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
-        addMetadata(NewGEP, GEP);
-      }
-    }
-
-    break;
-  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
@@ -6857,7 +6858,6 @@
   case Instruction::FPTrunc:
   case Instruction::FRem:
   case Instruction::FSub:
-  case Instruction::GetElementPtr:
   case Instruction::ICmp:
   case Instruction::IntToPtr:
   case Instruction::Load:
@@ -6923,7 +6923,9 @@
   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
     return false;
 
-  // Success: widen this instruction. We optimize the common case where
+  // Success: widen this instruction.
+
+  // Use the default widening recipe. We optimize the common case where
   // consecutive instructions can be represented by a single recipe.
   if (!VPBB->empty()) {
     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
@@ -7032,6 +7034,19 @@
     return true;
   }
 
+  // Handle GEP widening.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+    auto Scalarize = [&](unsigned VF) {
+      return CM.isScalarWithPredication(Instr, VF) ||
+             CM.isScalarAfterVectorization(Instr, VF) ||
+             CM.isProfitableToScalarize(Instr, VF);
+    };
+    if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
+      return false;
+    VPBB->appendRecipe(new VPWidenGEPRecipe(GEP, OrigLoop));
+    return true;
+  }
+
   // Check if Instr is to be widened by a general VPWidenRecipe, after
   // having first checked for specific widening recipes that deal with
   // Interleave Groups, Inductions and Phi nodes.
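Both willWiden and the new Scalarize lambda above are evaluated through LoopVectorizationPlanner::getDecisionAndClampRange, which queries a per-VF predicate and clamps the VF range so one answer covers all of it; when Scalarize holds for the clamped range, returning false leaves the GEP to the later replication path. A simplified model of that contract, assuming VFs in a range are consecutive powers of two; this is an illustration of the semantics, not LLVM's actual implementation:

#include <functional>

// A VF range [Start, End); the shape mirrors VPlan's VFRange.
struct VFRangeModel {
  unsigned Start; // Inclusive, a power of two.
  unsigned End;   // Exclusive.
};

// Return the predicate's value at Range.Start, clamping Range.End at the
// first VF where the predicate flips, so the caller can emit one recipe for
// the whole remaining range and plan the clipped VFs separately later.
static bool getDecisionAndClampRangeModel(
    const std::function<bool(unsigned)> &Predicate, VFRangeModel &Range) {
  bool DecisionAtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != DecisionAtStart) {
      Range.End = VF;
      break;
    }
  return DecisionAtStart;
}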
@@ -7252,7 +7267,7 @@
   SmallPtrSet<Instruction *, 1> DeadInstructions;
 
   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
-      Plan, Legal->getInductionVars(), DeadInstructions);
+      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
 
   return Plan;
 }
@@ -7282,6 +7297,11 @@
   State.ILV->widenInstruction(Instr);
 }
 
+void VPWidenGEPRecipe::execute(VPTransformState &State) {
+  State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
+                      IsIndexLoopInvariant);
+}
+
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
   State.ILV->widenIntOrFpInduction(IV, Trunc);
@@ -7428,14 +7448,14 @@
 
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   if (!User)
-    return State.ILV->vectorizeMemoryInstruction(&Instr);
+    return State.ILV->vectorizeMemoryInstruction(&Instr, Ptr, InBounds);
 
   // Last (and currently only) operand is a mask.
   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
   for (unsigned Part = 0; Part < State.UF; ++Part)
     MaskValues[Part] = State.get(Mask, Part);
-  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
+  State.ILV->vectorizeMemoryInstruction(&Instr, Ptr, InBounds, &MaskValues);
 }
 
 static ScalarEpilogueLowering
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -31,6 +31,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -586,6 +587,7 @@
     VPInterleaveSC,
     VPPredInstPHISC,
     VPReplicateSC,
+    VPWidenGEPSC,
     VPWidenIntOrFpInductionSC,
     VPWidenMemoryInstructionSC,
     VPWidenPHISC,
@@ -740,6 +742,36 @@
   void print(raw_ostream &O, const Twine &Indent) const override;
 };
 
+/// A recipe for handling GEP instructions.
+class VPWidenGEPRecipe : public VPRecipeBase {
+private:
+  GetElementPtrInst *GEP;
+  bool IsPtrLoopInvariant;
+  SmallBitVector IsIndexLoopInvariant;
+
+public:
+  VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop)
+      : VPRecipeBase(VPWidenGEPSC), GEP(GEP),
+        IsIndexLoopInvariant(GEP->getNumIndices(), false) {
+    IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
+    unsigned I = 0;
+    for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end()))
+      IsIndexLoopInvariant[I++] = OrigLoop->isLoopInvariant(U.get());
+  }
+  ~VPWidenGEPRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC;
+  }
+
+  /// Generate the gep nodes.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
 /// A recipe for handling phi nodes of integer and floating-point inductions,
 /// producing their vector and scalar values.
 class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
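The recipe deliberately front-loads the Loop queries: invariance of the pointer and of each index is computed once against OrigLoop at construction, and execute() later consumes only the cached bits. A standalone sketch of that analysis step; computeIndexInvariance is an illustrative name, not an LLVM API:

#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// One bit per GEP index, set when that index is invariant in OrigLoop.
// SmallBitVector keeps the common handful-of-indices case inline, which is
// why the recipe prefers it over a heavier container of bools.
static SmallBitVector computeIndexInvariance(const GetElementPtrInst *GEP,
                                             const Loop *OrigLoop) {
  SmallBitVector Invariant(GEP->getNumIndices(), false);
  unsigned I = 0;
  for (const Use &U : make_range(GEP->idx_begin(), GEP->idx_end()))
    Invariant[I++] = OrigLoop->isLoopInvariant(U.get());
  return Invariant;
}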
@@ -960,12 +992,17 @@
 private:
   Instruction &Instr;
   std::unique_ptr<VPUser> User;
+  bool InBounds = false;
+  Value *Ptr;
 
 public:
   VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask)
       : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {
     if (Mask) // Create a VPInstruction to register as a user of the mask.
       User.reset(new VPUser({Mask}));
+    Ptr = getLoadStorePointerOperand(&Instr);
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = GEP->isInBounds();
   }
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -661,6 +661,10 @@
   O << " " << VPlanIngredient(IV) << "\\l\"";
 }
 
+void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const {
+  O << " +\n" << Indent << "\"WIDEN-GEP " << VPlanIngredient(GEP) << "\\l\"";
+}
+
 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
   O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
 }
Index: llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
+++ llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
@@ -25,7 +25,7 @@
   /// Replaces the VPInstructions in \p Plan with corresponding
   /// widen recipes.
   static void VPInstructionsToVPRecipes(
-      VPlanPtr &Plan,
+      Loop *OrigLoop, VPlanPtr &Plan,
       LoopVectorizationLegality::InductionList *Inductions,
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 };
Index: llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -17,7 +17,7 @@
 using namespace llvm;
 
 void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
-    VPlanPtr &Plan,
+    Loop *OrigLoop, VPlanPtr &Plan,
     LoopVectorizationLegality::InductionList *Inductions,
     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
 
@@ -64,6 +64,8 @@
         NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
       } else
         NewRecipe = new VPWidenPHIRecipe(Phi);
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop);
     } else {
       // If the last recipe is a VPWidenRecipe, add Inst to it instead of
       // creating a new recipe.
Index: llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
===================================================================
--- llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -89,8 +89,8 @@
 
   LoopVectorizationLegality::InductionList Inductions;
   SmallPtrSet<Instruction *, 1> DeadInstructions;
-  VPlanHCFGTransforms::VPInstructionsToVPRecipes(Plan, &Inductions,
-                                                 DeadInstructions);
+  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+      LI->getLoopFor(LoopHeader), Plan, &Inductions, DeadInstructions);
 }
 
 TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
@@ -119,8 +119,8 @@
 
   LoopVectorizationLegality::InductionList Inductions;
   SmallPtrSet<Instruction *, 1> DeadInstructions;
-  VPlanHCFGTransforms::VPInstructionsToVPRecipes(Plan, &Inductions,
-                                                 DeadInstructions);
+  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
+      LI->getLoopFor(LoopHeader), Plan, &Inductions, DeadInstructions);
 
   VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
   EXPECT_NE(nullptr, Entry->getSingleSuccessor());
@@ -136,7 +136,7 @@
   auto *Phi = dyn_cast<VPWidenPHIRecipe>(&*Iter++);
   EXPECT_NE(nullptr, Phi);
 
-  auto *Idx = dyn_cast<VPWidenRecipe>(&*Iter++);
+  auto *Idx = dyn_cast<VPWidenGEPRecipe>(&*Iter++);
   EXPECT_NE(nullptr, Idx);
 
   auto *Load = dyn_cast<VPWidenMemoryInstructionRecipe>(&*Iter++);