Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -214,6 +214,8 @@
     VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
 
     Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+    Value *getOrCreateScalarValue(Value *V,
+                                  const VPIteration &Instance) override;
   };
 
   /// A builder used to construct the current plan.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -482,15 +482,18 @@
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to,
-  /// optionally masking the vector operations if \p BlockInMask is non-null.
-  void vectorizeInterleaveGroup(Instruction *Instr,
-                                VectorParts *BlockInMask = nullptr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to
+  /// with the base address given in \p Addr, optionally masking the vector
+  /// operations if \p BlockInMask is non-null. Use \p State to translate given
+  /// VPValues to IR values in the vectorized loop.
+  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
+                                VPValue *Addr, VPValue *BlockInMask = nullptr);
 
   /// Vectorize Load and Store instructions, optionally masking the vector
   /// operations if \p BlockInMask is non-null.
-  void vectorizeMemoryInstruction(Instruction *Instr,
-                                  VectorParts *BlockInMask = nullptr);
+  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
+                                  VPValue *Addr,
+                                  VPValue *BlockInMask = nullptr);
 
   /// Set the debug location in the builder using the debug location in
   /// the instruction.
@@ -2166,7 +2169,9 @@
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
-                                                   VectorParts *BlockInMask) {
+                                                   VPTransformState &State,
+                                                   VPValue *Addr,
+                                                   VPValue *BlockInMask) {
   const InterleaveGroup<Instruction> *Group =
       Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
@@ -2176,23 +2181,21 @@
     return;
 
   const DataLayout &DL = Instr->getModule()->getDataLayout();
-  Value *Ptr = getLoadStorePointerOperand(Instr);
 
   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getMemInstValueType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
-  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
 
   // Prepare for the new pointers.
-  setDebugLocFromInst(Builder, Ptr);
   SmallVector<Value *, 2> NewPtrs;
   unsigned Index = Group->getIndex(Instr);
 
-  VectorParts Mask;
+  VectorParts Mask(UF);
   bool IsMaskForCondRequired = BlockInMask;
   if (IsMaskForCondRequired) {
-    Mask = *BlockInMask;
+    for (unsigned Part = 0; Part < UF; ++Part)
+      Mask[Part] = State.get(BlockInMask, Part);
     // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
@@ -2207,12 +2210,9 @@
   if (Group->isReverse())
     Index += (VF - 1) * Group->getFactor();
 
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-    InBounds = gep->isInBounds();
-
   for (unsigned Part = 0; Part < UF; Part++) {
-    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
+    Value *NewPtr = State.get(Addr, {Part, 0});
+    setDebugLocFromInst(Builder, NewPtr);
 
     // Notice current instruction could be any index. Need to adjust the address
     // to the member of index 0.
@@ -2225,11 +2225,16 @@
     //       A[i]   = b;  // Member of index 0
     //       A[i+2] = c;  // Member of index 2 (Current instruction)
     // Current pointer is pointed to A[i+2], adjust it to A[i].
+
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(NewPtr->stripPointerCasts()))
+      InBounds = gep->isInBounds();
     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
-    if (InBounds)
-      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
+    cast<GetElementPtrInst>(NewPtr)->setIsInBounds(InBounds);
 
     // Cast to the vector pointer type.
+    unsigned AddressSpace = NewPtr->getType()->getPointerAddressSpace();
+    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
   }
 
@@ -2353,7 +2358,9 @@
 }
 
 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
-                                                     VectorParts *BlockInMask) {
+                                                     VPTransformState &State,
+                                                     VPValue *Addr,
+                                                     VPValue *BlockInMask) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2365,17 +2372,15 @@
   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
          "CM decision should be taken at this point");
   if (Decision == LoopVectorizationCostModel::CM_Interleave)
-    return vectorizeInterleaveGroup(Instr);
+    return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
-  Value *Ptr = getLoadStorePointerOperand(Instr);
   // An alignment of 0 means target abi alignment. We need to use the scalar's
   // target abi alignment in such a case.
   const DataLayout &DL = Instr->getModule()->getDataLayout();
   const Align Alignment =
       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
-  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -2390,24 +2395,20 @@
   assert((ConsecutiveStride || CreateGatherScatter) &&
          "The instruction should be scalarized");
 
-  // Handle consecutive loads/stores.
-  if (ConsecutiveStride)
-    Ptr = getOrCreateScalarValue(Ptr, {0, 0});
-
-  VectorParts Mask;
+  VectorParts Mask(UF);
   bool isMaskRequired = BlockInMask;
   if (isMaskRequired)
-    Mask = *BlockInMask;
-
-  bool InBounds = false;
-  if (auto *gep = dyn_cast<GetElementPtrInst>(
-          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
-    InBounds = gep->isInBounds();
+    for (unsigned Part = 0; Part < UF; ++Part)
+      Mask[Part] = State.get(BlockInMask, Part);
 
   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
     // Calculate the pointer for the specific unroll-part.
     GetElementPtrInst *PartPtr = nullptr;
 
+    bool InBounds = false;
+    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+      InBounds = gep->isInBounds();
+
     if (Reverse) {
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
@@ -2425,6 +2426,7 @@
       PartPtr->setIsInBounds(InBounds);
     }
 
+    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
   };
 
@@ -2437,7 +2439,7 @@
       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
       if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
-        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+        Value *VectorGep = State.get(Addr, Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
                                             Alignment.value(), MaskPart);
       } else {
@@ -2448,7 +2450,7 @@
           // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
         }
-        auto *VecPtr = CreateVecPtr(Part, Ptr);
+        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
         if (isMaskRequired)
           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
                                             Alignment.value(), Mask[Part]);
@@ -2468,12 +2470,12 @@
     Value *NewLI;
     if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
-      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+      Value *VectorGep = State.get(Addr, Part);
       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
                                          nullptr, "wide.masked.gather");
       addMetadata(NewLI, LI);
     } else {
-      auto *VecPtr = CreateVecPtr(Part, Ptr);
+      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
       if (isMaskRequired)
         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
                                          UndefValue::get(DataTy),
@@ -6800,7 +6802,8 @@
   if (Legal->isMaskRequired(I))
     Mask = createBlockInMask(I->getParent(), Plan);
 
-  return new VPWidenMemoryInstructionRecipe(*I, Mask);
+  VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
+  return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
 }
 
 VPWidenIntOrFpInductionRecipe *
@@ -7249,7 +7252,8 @@
   for (auto IG : InterleaveGroups) {
     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
         RecipeBuilder.getRecipe(IG->getInsertPos()));
-    (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
+    (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
+        ->insertBefore(Recipe);
 
     for (unsigned i = 0; i < IG->getFactor(); ++i)
       if (Instruction *Member = IG->getMember(i)) {
@@ -7323,13 +7327,21 @@
   return ILV.getOrCreateVectorValue(V, Part);
 }
 
+Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
+    Value *V, const VPIteration &Instance) {
+  return ILV.getOrCreateScalarValue(V, Instance);
+}
+
 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
   O << " +\n"
     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
-  if (User) {
+  O << ", ";
+  getAddr()->printAsOperand(O);
+  VPValue *Mask = getMask();
+  if (Mask) {
     O << ", ";
-    User->getOperand(0)->printAsOperand(O);
+    Mask->printAsOperand(O);
   }
   O << "\\l\"";
   for (unsigned i = 0; i < IG->getFactor(); ++i)
@@ -7398,15 +7410,8 @@
 
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
-  if (!User)
-    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
-
-  // Last (and currently only) operand is a mask.
-  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
-  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    MaskValues[Part] = State.get(Mask, Part);
-  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
+  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
+                                      getMask());
 }
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7493,14 +7498,7 @@
 }
 
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
-  VPValue *Mask = getMask();
-  if (!Mask)
-    return State.ILV->vectorizeMemoryInstruction(&Instr);
-
-  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    MaskValues[Part] = State.get(Mask, Part);
-  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
+  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
 }
 
 static ScalarEpilogueLowering
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -227,6 +227,8 @@
 struct VPCallback {
   virtual ~VPCallback() {}
   virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
+  virtual Value *getOrCreateScalarValue(Value *V,
+                                        const VPIteration &Instance) = 0;
 };
 
 /// VPTransformState holds information passed down when "executing" a VPlan,
@@ -269,6 +271,13 @@
     return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
   }
 
+  /// Get the generated Value for a given VPValue and given Part and Lane. Note
+  /// that as per-lane Defs are still created by ILV and managed in its
+  /// ValueMap, this method currently just delegates the call to ILV.
+  Value *get(VPValue *Def, const VPIteration &Instance) {
+    return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
+  }
+
   /// Set the generated Value for a given VPValue and a given Part.
   void set(VPValue *Def, Value *V, unsigned Part) {
     if (!Data.PerPartOutput.count(Def)) {
@@ -862,13 +871,14 @@
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
   const InterleaveGroup<Instruction> *IG;
-  std::unique_ptr<VPUser> User;
+  VPUser User;
 
 public:
-  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask)
-      : VPRecipeBase(VPInterleaveSC), IG(IG) {
-    if (Mask) // Create a VPInstruction to register as a user of the mask.
-      User.reset(new VPUser({Mask}));
+  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+                     VPValue *Mask)
+      : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) {
+    if (Mask)
+      User.addOperand(Mask);
   }
   ~VPInterleaveRecipe() override = default;
 
@@ -877,6 +887,18 @@
     return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
   }
 
+  /// Return the address accessed by this recipe.
+  VPValue *getAddr() const {
+    return User.getOperand(0); // Address is the 1st, mandatory operand.
+  }
+
+  /// Return the mask used by this recipe. Note that a full mask is represented
+  /// by a nullptr.
+  VPValue *getMask() const {
+    // Mask is optional and therefore the last, currently 2nd operand.
+    return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr;
+  }
+
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
 
@@ -999,13 +1021,14 @@
 class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
 private:
   Instruction &Instr;
-  std::unique_ptr<VPUser> User;
+  VPUser User;
 
 public:
-  VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask)
-      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {
-    if (Mask) // Create a VPInstruction to register as a user of the mask.
-      User.reset(new VPUser({Mask}));
+  VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr,
+                                 VPValue *Mask)
+      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) {
+    if (Mask)
+      User.addOperand(Mask);
   }
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1013,11 +1036,16 @@
     return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
   }
 
+  /// Return the address accessed by this recipe.
+  VPValue *getAddr() const {
+    return User.getOperand(0); // Address is the 1st, mandatory operand.
+  }
+
   /// Return the mask used by this recipe. Note that a full mask is represented
   /// by a nullptr.
   VPValue *getMask() const {
-    // Mask is the last operand.
-    return User ? User->getOperand(User->getNumOperands() - 1) : nullptr;
+    // Mask is optional and therefore the last, currently 2nd operand.
+    return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr;
   }
 
   /// Generate the wide load/store.
@@ -1414,6 +1442,13 @@
     return Value2VPValue[V];
   }
 
+  VPValue *getOrAddVPValue(Value *V) {
+    assert(V && "Trying to get or add the VPValue of a null Value");
+    if (!Value2VPValue.count(V))
+      addVPValue(V);
+    return getVPValue(V);
+  }
+
   /// Return the VPLoopInfo analysis for this VPlan.
   VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
   const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -735,6 +735,8 @@
 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,
                                            const Twine &Indent) const {
   O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+  O << ", ";
+  getAddr()->printAsOperand(O);
   VPValue *Mask = getMask();
   if (Mask) {
     O << ", ";
Index: llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -56,7 +56,9 @@
       VPRecipeBase *NewRecipe = nullptr;
       // Create VPWidenMemoryInstructionRecipe for loads and stores.
       if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
-        NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/);
+        NewRecipe = new VPWidenMemoryInstructionRecipe(
+            *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+            nullptr /*Mask*/);
       else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
         InductionDescriptor II = Inductions->lookup(Phi);
         if (II.getKind() == InductionDescriptor::IK_IntInduction ||
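
Note on the operand convention used by the updated recipes: both VPInterleaveRecipe and VPWidenMemoryInstructionRecipe now store the address as a mandatory first operand of their embedded VPUser and the mask, when present, as an optional second operand, with getAddr()/getMask() recovering them by position. The following standalone sketch (plain C++, not LLVM code; ToyValue and ToyMemoryRecipe are made-up names for illustration only) shows that layout in isolation.

// Toy model of "mandatory address + optional mask" operand packing.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for a VPValue: just a named node.
struct ToyValue {
  std::string Name;
};

// Stand-in for a recipe that keeps its operands the way the patched
// recipes keep them in their embedded VPUser.
class ToyMemoryRecipe {
  std::vector<ToyValue *> Operands; // operand 0 = address, operand 1 = mask (optional)

public:
  ToyMemoryRecipe(ToyValue *Addr, ToyValue *Mask) {
    assert(Addr && "address operand is mandatory");
    Operands.push_back(Addr);
    if (Mask) // the mask is appended only when the access is predicated
      Operands.push_back(Mask);
  }

  ToyValue *getAddr() const { return Operands[0]; }

  // A full (all-ones) mask is represented by not storing a mask at all.
  ToyValue *getMask() const {
    return Operands.size() == 2 ? Operands[1] : nullptr;
  }
};

int main() {
  ToyValue Addr{"%gep"}, Mask{"%block.mask"};
  ToyMemoryRecipe Unmasked(&Addr, nullptr);
  ToyMemoryRecipe Masked(&Addr, &Mask);
  std::cout << "unmasked: addr=" << Unmasked.getAddr()->Name << " mask="
            << (Unmasked.getMask() ? Unmasked.getMask()->Name : "<none>")
            << "\n";
  std::cout << "masked:   addr=" << Masked.getAddr()->Name
            << " mask=" << Masked.getMask()->Name << "\n";
  return 0;
}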
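
The new VPlan::getOrAddVPValue() follows a simple memoized get-or-create pattern: look the IR value up in Value2VPValue and add a wrapper on first use, so every later query returns the same VPValue. Below is a minimal sketch of that pattern under the same idea (plain C++, not LLVM code; IRValue, PlanValue and ToyPlan are hypothetical names).

// Toy illustration of the get-or-add lookup behind getOrAddVPValue().
#include <cassert>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct IRValue { std::string Name; };      // stand-in for llvm::Value
struct PlanValue { IRValue *Underlying; }; // stand-in for VPValue

class ToyPlan {
  std::unordered_map<IRValue *, std::unique_ptr<PlanValue>> Value2PlanValue;

public:
  PlanValue *getOrAddValue(IRValue *V) {
    assert(V && "trying to get or add a null value");
    auto &Slot = Value2PlanValue[V]; // default-constructed (null) on first lookup
    if (!Slot)
      Slot = std::make_unique<PlanValue>(PlanValue{V});
    return Slot.get();
  }
};

int main() {
  IRValue Ptr{"%ptr"};
  ToyPlan Plan;
  PlanValue *A = Plan.getOrAddValue(&Ptr);
  PlanValue *B = Plan.getOrAddValue(&Ptr);
  // Repeated queries for the same IR value yield the same wrapper object.
  std::cout << "same wrapper: " << std::boolalpha << (A == B) << "\n";
  return 0;
}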