diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -546,6 +546,7 @@ /// BlockInMask is non-null. Use \p State to translate given VPValues to IR /// values in the vectorized loop. void vectorizeInterleaveGroup(const InterleaveGroup *Group, + ArrayRef VPDefs, VPTransformState &State, VPValue *Addr, VPValue *BlockInMask = nullptr); @@ -2320,8 +2321,8 @@ // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup *Group, VPTransformState &State, - VPValue *Addr, VPValue *BlockInMask) { + const InterleaveGroup *Group, ArrayRef VPDefs, + VPTransformState &State, VPValue *Addr, VPValue *BlockInMask) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); @@ -2424,6 +2425,7 @@ // For each member in the group, shuffle out the appropriate data from the // wide loads. + unsigned J = 0; for (unsigned I = 0; I < InterleaveFactor; ++I) { Instruction *Member = Group->getMember(I); @@ -2448,8 +2450,9 @@ if (Group->isReverse()) StridedVec = reverseVector(StridedVec); - VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); + State.set(VPDefs[J], Member, StridedVec, Part); } + ++J; } return; } @@ -7288,9 +7291,8 @@ return BlockMaskCache[BB] = BlockMask; } -VPWidenMemoryInstructionRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); @@ -7318,6 +7320,19 @@ Mask = createBlockInMask(I->getParent(), Plan); VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + auto II = InsertPtToGroup.find(I); + if (II != InsertPtToGroup.end()) { + auto *IG = II->second; + auto *InterleaveG = new VPInterleaveRecipe(IG, Addr, Mask); + unsigned j = 0; + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) { + Plan->addVPValue(Member, InterleaveG->getResult(j)); + j++; + } + return InterleaveG; + } + if (LoadInst *Load = dyn_cast(I)) { auto *WidenLoad = new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); Plan->addVPValue(Load, WidenLoad); @@ -7677,6 +7692,7 @@ DenseMap PredInst2Recipe; SmallPtrSet *, 1> InterleaveGroups; + SmallPtrSet DeadInterleaveGroupMembers; VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); @@ -7722,11 +7738,30 @@ if (!getDecisionAndClampRange(applyIG, Range)) continue; InterleaveGroups.insert(IG); + RecipeBuilder.recordInterleaveGroup(IG); for (unsigned i = 0; i < IG->getFactor(); i++) - if (Instruction *Member = IG->getMember(i)) + if (Instruction *Member = IG->getMember(i)) { RecipeBuilder.recordRecipeOf(Member); + if (Member != IG->getInsertPos()) + DeadInterleaveGroupMembers.insert(Member); + } }; + auto skipDeadInterleaveMembers = + [&DeadInterleaveGroupMembers](Instruction *I) { + BasicBlock *BB = I->getParent(); + for (auto &I : make_range(I->getIterator(), BB->end())) + if (!DeadInterleaveGroupMembers.contains(&I)) + return &I; + llvm_unreachable("Need to find a valid insert point"); + }; + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(skipDeadInterleaveMembers(Entry.first)); + RecipeBuilder.recordRecipeOf(skipDeadInterleaveMembers(Entry.second)); + } + // --------------------------------------------------------------------------- // Build initial VPlan: Scan the body of the loop in a topological order to // visit each basic block after having visited its predecessor basic blocks. @@ -7762,7 +7797,8 @@ // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa(Instr) || DeadInstructions.count(Instr)) + if (isa(Instr) || DeadInstructions.count(Instr) || + DeadInterleaveGroupMembers.contains(Instr)) continue; if (auto Recipe = @@ -7800,26 +7836,13 @@ // Apply Sink-After legal constraints. for (auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + VPRecipeBase *Sink = + RecipeBuilder.getRecipe(skipDeadInterleaveMembers(Entry.first)); + VPRecipeBase *Target = + RecipeBuilder.getRecipe(skipDeadInterleaveMembers(Entry.second)); Sink->moveAfter(Target); } - // Interleave memory: for each Interleave Group we marked earlier as relevant - // for this VPlan, replace the Recipes widening its memory instructions with a - // single VPInterleaveRecipe at its insertion point. - for (auto IG : InterleaveGroups) { - auto *Recipe = cast( - RecipeBuilder.getRecipe(IG->getInsertPos())); - (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) - ->insertBefore(Recipe); - - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *Member = IG->getMember(i)) { - RecipeBuilder.getRecipe(Member)->eraseFromParent(); - } - } - // Adjust the recipes for any inloop reductions. if (Range.Start > 1) adjustRecipesForInLoopReductions(Plan, RecipeBuilder); @@ -8036,7 +8059,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); + State.ILV->vectorizeInterleaveGroup(IG, getDefs(), State, getAddr(), + getMask()); } void VPReductionRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -53,6 +53,8 @@ // marked by having a nullptr entry in this map. DenseMap Ingredient2Recipe; + DenseMap *> InsertPtToGroup; + /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -61,8 +63,8 @@ /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe * - tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); + VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. @@ -129,6 +131,10 @@ Ingredient2Recipe[I] = nullptr; } + void recordInterleaveGroup(const InterleaveGroup *IG) { + InsertPtToGroup[IG->getInsertPos()] = IG; + } + /// Return the recipe created for given ingredient. VPRecipeBase *getRecipe(Instruction *I) { assert(Ingredient2Recipe.count(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1012,16 +1012,20 @@ /// VPInterleaveRecipe is a recipe for transforming an interleave group of load /// or stores into one wide load/store and shuffles. -class VPInterleaveRecipe : public VPRecipeBase { +class VPInterleaveRecipe : public VPRecipeBase, public VPValue { const InterleaveGroup *IG; VPUser User; + SmallVector Defs; public: VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + : VPRecipeBase(VPInterleaveSC), VPValue(VPValue::VPVInterleaveSC), IG(IG), + User({Addr}) { if (Mask) User.addOperand(Mask); + for (unsigned i = 0; i < IG->getNumMembers(); i++) + Defs.push_back(new VPMultiValue(this)); } ~VPInterleaveRecipe() override = default; @@ -1029,6 +1033,9 @@ static inline bool classof(const VPRecipeBase *V) { return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVInterleaveSC; + } /// Return the address accessed by this recipe. VPValue *getAddr() const { @@ -1050,8 +1057,27 @@ VPSlotTracker &SlotTracker) const override; const InterleaveGroup *getInterleaveGroup() { return IG; } + + VPMultiValue *getResult(unsigned Idx) { return Defs[Idx]; } + ArrayRef getDefs() { return Defs; } }; +/*template <> struct simplify_type {*/ +// using SimpleType = VPValue *; + +// static SimpleType getSimplifiedValue(VPMultiValue &Val) { +// return Val.getProducer(); +//} +//}; + +// template <> struct simplify_type { +// using SimpleType = VPValue *; + +// static SimpleType getSimplifiedValue(VPMultiValue *&Val) { +// return Val->getProducer(); +//} +//}; + /// A recipe to represent inloop reduction operations, performing a reduction on /// a vector operand into a scalar value, and adding the result to a chain. class VPReductionRecipe : public VPRecipeBase { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -83,7 +83,8 @@ VPMemoryInstructionSC, VPVWidenCallSC, VPVWidenSelectSC, - VPVWidenGEPSC + VPVWidenGEPSC, + VPVInterleaveSC }; VPValue(Value *UV = nullptr) : VPValue(VPValueSC, UV) {} diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -909,8 +909,8 @@ ; CHECK: %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> ; CHECK: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> ; CHECK: sext <4 x i16> %[[VSHUF0]] to <4 x i32> -; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32> ; CHECK: sext <4 x i16> %[[VSHUF1]] to <4 x i32> +; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32> ; CHECK: mul nsw <4 x i32> ; CHECK: mul nsw <4 x i32>