diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -502,6 +502,7 @@ /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup *Group, + ArrayRef VPDefs, VPTransformState &State, VPValue *Addr, VPValue *BlockInMask = nullptr); @@ -2211,8 +2212,8 @@ // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup *Group, VPTransformState &State, - VPValue *Addr, VPValue *BlockInMask) { + const InterleaveGroup *Group, ArrayRef VPDefs, + VPTransformState &State, VPValue *Addr, VPValue *BlockInMask) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); @@ -2309,6 +2310,7 @@ // For each member in the group, shuffle out the appropriate data from the // wide loads. + unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); @@ -2330,8 +2332,9 @@ if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); + State.set(VPDefs[J], Member, StridedVec, Part); } + ++J; } return; } @@ -6925,9 +6928,8 @@ return BlockMaskCache[BB] = BlockMask; } -VPWidenMemoryInstructionRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); @@ -6954,6 +6956,19 @@ Mask = createBlockInMask(I->getParent(), Plan); VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + auto II = InsertPtToGroup.find(I); + if (II != InsertPtToGroup.end()) { + auto *IG = II->second; + auto *InterleaveG = new VPInterleaveRecipe(IG, Addr, Mask); + unsigned j = 0; + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) { + Plan->addVPValue(Member, InterleaveG->getResult(j)); + j++; + } + return InterleaveG; + } + if (LoadInst *Load = dyn_cast(I)) { auto *WidenLoad = new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); Plan->addVPValue(Load, WidenLoad); @@ -7306,6 +7321,7 @@ DenseMap PredInst2Recipe; SmallPtrSet *, 1> InterleaveGroups; + SmallPtrSet DeadInterleaveGroupMembers; VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); @@ -7314,13 +7330,6 @@ // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } - // For each interleave group which is relevant for this (possibly trimmed) // Range, add it to the set of groups to be later applied to the VPlan and add // placeholders for its members' Recipes which we'll be replacing with a @@ -7334,11 +7343,30 @@ if (!getDecisionAndClampRange(applyIG, Range)) continue; InterleaveGroups.insert(IG); + RecipeBuilder.recordInterleaveGroup(IG); for (unsigned i = 0; i < IG->getFactor(); i++) - if (Instruction *Member = IG->getMember(i)) + if (Instruction *Member = IG->getMember(i)) { RecipeBuilder.recordRecipeOf(Member); + if (Member != IG->getInsertPos()) + DeadInterleaveGroupMembers.insert(Member); + } }; + auto skipDeadInterleaveMembers = + [&DeadInterleaveGroupMembers](Instruction *I) { + BasicBlock *BB = I->getParent(); + for (auto &I : make_range(I->getIterator(), BB->end())) + if (!DeadInterleaveGroupMembers.contains(&I)) + return &I; + llvm_unreachable("Need to find a valid insert point"); + }; + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(skipDeadInterleaveMembers(Entry.first)); + RecipeBuilder.recordRecipeOf(skipDeadInterleaveMembers(Entry.second)); + } + // --------------------------------------------------------------------------- // Build initial VPlan: Scan the body of the loop in a topological order to // visit each basic block after having visited its predecessor basic blocks. @@ -7374,7 +7402,8 @@ // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa(Instr) || DeadInstructions.count(Instr)) + if (isa(Instr) || DeadInstructions.count(Instr) || + DeadInterleaveGroupMembers.contains(Instr)) continue; if (auto Recipe = @@ -7412,26 +7441,13 @@ // Apply Sink-After legal constraints. for (auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + VPRecipeBase *Sink = + RecipeBuilder.getRecipe(skipDeadInterleaveMembers(Entry.first)); + VPRecipeBase *Target = + RecipeBuilder.getRecipe(skipDeadInterleaveMembers(Entry.second)); Sink->moveAfter(Target); } - // Interleave memory: for each Interleave Group we marked earlier as relevant - // for this VPlan, replace the Recipes widening its memory instructions with a - // single VPInterleaveRecipe at its insertion point. - for (auto IG : InterleaveGroups) { - auto *Recipe = cast( - RecipeBuilder.getRecipe(IG->getInsertPos())); - (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) - ->insertBefore(Recipe); - - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *Member = IG->getMember(i)) { - RecipeBuilder.getRecipe(Member)->eraseFromParent(); - } - } - // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. if (CM.foldTailByMasking()) { @@ -7588,7 +7604,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); + State.ILV->vectorizeInterleaveGroup(IG, getDefs(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -53,6 +53,8 @@ // marked by having a nullptr entry in this map. DenseMap Ingredient2Recipe; + DenseMap *> InsertPtToGroup; + /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -61,8 +63,8 @@ /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe * - tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); + VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. @@ -129,6 +131,10 @@ Ingredient2Recipe[I] = nullptr; } + void recordInterleaveGroup(const InterleaveGroup *IG) { + InsertPtToGroup[IG->getInsertPos()] = IG; + } + /// Return the recipe created for given ingredient. VPRecipeBase *getRecipe(Instruction *I) { assert(Ingredient2Recipe.count(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1006,16 +1006,20 @@ /// VPInterleaveRecipe is a recipe for transforming an interleave group of load /// or stores into one wide load/store and shuffles. -class VPInterleaveRecipe : public VPRecipeBase { +class VPInterleaveRecipe : public VPRecipeBase, public VPValue { const InterleaveGroup *IG; VPUser User; + SmallVector Defs; public: VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + : VPRecipeBase(VPInterleaveSC), VPValue(VPValue::VPVInterleaveSC), IG(IG), + User({Addr}) { if (Mask) User.addOperand(Mask); + for (unsigned i = 0; i < IG->getNumMembers(); i++) + Defs.push_back(new VPMultiValue(this)); } ~VPInterleaveRecipe() override = default; @@ -1023,6 +1027,9 @@ static inline bool classof(const VPRecipeBase *V) { return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVInterleaveSC; + } /// Return the address accessed by this recipe. VPValue *getAddr() const { @@ -1044,6 +1051,25 @@ VPSlotTracker &SlotTracker) const override; const InterleaveGroup *getInterleaveGroup() { return IG; } + + VPValue *getResult(unsigned Idx) { return Defs[Idx]; } + ArrayRef getDefs() { return Defs; } +}; + +template <> struct simplify_type { + using SimpleType = VPValue *; + + static SimpleType getSimplifiedValue(VPMultiValue &Val) { + return Val.getProducer(); + } +}; + +template <> struct simplify_type { + using SimpleType = VPValue *; + + static SimpleType getSimplifiedValue(VPMultiValue *&Val) { + return Val->getProducer(); + } }; /// VPReplicateRecipe replicates a given instruction producing multiple scalar diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -47,6 +47,8 @@ SmallVector Users; + unsigned ResultIdx = 0; + protected: // Hold the underlying Value, if any, attached to this VPValue. Value *UnderlyingVal; @@ -78,16 +80,19 @@ /// type identification. enum { VPValueSC, + VPMultiValueSC, VPInstructionSC, VPMemoryInstructionSC, VPVWidenCallSC, VPVWidenSelectSC, - VPVWidenGEPSC + VPVWidenGEPSC, + VPVInterleaveSC }; VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {} VPValue(const VPValue &) = delete; VPValue &operator=(const VPValue &) = delete; + virtual ~VPValue() {} /// \return an ID for the concrete type of this object. /// This is used to implement the classof checks. This should not be used @@ -98,7 +103,7 @@ void print(raw_ostream &OS, VPSlotTracker &Tracker) const; unsigned getNumUsers() const { return Users.size(); } - void addUser(VPUser &User) { Users.push_back(&User); } + virtual void addUser(VPUser &User) { Users.push_back(&User); } typedef SmallVectorImpl::iterator user_iterator; typedef SmallVectorImpl::const_iterator const_user_iterator; @@ -129,6 +134,27 @@ void replaceAllUsesWith(VPValue *New); }; +class VPMultiValue : public VPValue { + VPValue *Producer; + +public: + VPMultiValue(VPValue *Producer, Value *UV = nullptr) + : VPValue(VPValue::VPMultiValueSC, UV), Producer(Producer) {} + ~VPMultiValue() override {} + + VPValue *getProducer() { return Producer; } + VPValue const *getProducer() const { return Producer; } + + void addUser(VPUser &User) override { + VPValue::addUser(User); + Producer->addUser(User); + } + + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPMultiValueSC; + } +}; + typedef DenseMap Value2VPValueTy; typedef DenseMap VPValue2ValueTy; diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -909,8 +909,8 @@ ; CHECK: %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> ; CHECK: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> ; CHECK: sext <4 x i16> %[[VSHUF0]] to <4 x i32> -; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32> ; CHECK: sext <4 x i16> %[[VSHUF1]] to <4 x i32> +; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32> ; CHECK: mul nsw <4 x i32> ; CHECK: mul nsw <4 x i32>