diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2181,6 +2181,22 @@
       const DataLayout &DL, ScalarEvolution &SE,
       const BoUpSLP &R);
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
+  /// users of \p TE and collects the stores. It returns the map from the store
+  /// pointers to the collected stores.
+  DenseMap<Value *, SmallVector<StoreInst *, 4>>
+  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
+
+  /// Iterates through the users of \p TE, looking for scalar stores that can be
+  /// potentially vectorized in a future SLP-tree. If found, it keeps track of
+  /// their order and builds an order index vector for each store bundle. It
+  /// returns all these order vectors found.
+  /// We run this after the tree has formed, otherwise we may come across user
+  /// instructions that are not yet in the tree.
+  SmallVector<OrdersType, 1>
+  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+
   struct TreeEntry {
     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
     TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2264,6 +2280,10 @@
     /// Does this entry require reordering?
     SmallVector<unsigned, 4> ReorderIndices;
 
+    /// The vector of reorder (i.e., shuffle) indices of external users that are
+    /// probably going to be vectorized in a future SLP tree.
+    SmallVector<OrdersType, 1> ExternalUserReorderIndices;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -2453,6 +2473,14 @@
       for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
       dbgs() << "\n";
+      dbgs() << "ExternalUserReorderIndices: ";
+      for (const auto &ExtReorderIndices : ExternalUserReorderIndices) {
+        dbgs() << "{";
+        for (unsigned Idx : ExtReorderIndices)
+          dbgs() << Idx << ", ";
+        dbgs() << "}, ";
+      }
+      dbgs() << "\n";
       dbgs() << "UserTreeIndices: ";
       for (const auto &EInfo : UserTreeIndices)
         dbgs() << EInfo << ", ";
@@ -3525,6 +3553,8 @@
       if (TE->State != TreeEntry::Vectorize)
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
+    if (!TE->ExternalUserReorderIndices.empty())
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
   });
 
   // Reorder the graph nodes according to their vectorization factor.
@@ -3554,6 +3584,14 @@
           return GathersToOrders.find(OpTE)->second;
         return OpTE->ReorderIndices;
       }();
+      // First consider the order of the external scalar users.
+      if (!OpTE->ExternalUserReorderIndices.empty()) {
+        for (const OrdersType &ExtOrder : OpTE->ExternalUserReorderIndices)
+          ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+        // No other useful reorder data in this entry.
+        if (Order.empty())
+          continue;
+      }
       // Stores actually store the mask, not the order, need to invert.
       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
           OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3620,6 +3658,8 @@
         // Build correct orders for extract{element,value}, loads and
         // stores.
         reorderOrder(TE->ReorderIndices, Mask);
+        for (OrdersType &ExtReorder : TE->ExternalUserReorderIndices)
+          reorderOrder(ExtReorder, Mask);
         if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
           TE->reorderOperands(Mask);
       } else {
@@ -3628,6 +3668,8 @@
         assert(TE->ReorderIndices.empty() &&
                "Expected empty reorder sequence.");
         reorderScalars(TE->Scalars, Mask);
+        for (OrdersType &ExtReorder : TE->ExternalUserReorderIndices)
+          reorderOrder(ExtReorder, Mask);
       }
       if (!TE->ReuseShuffleIndices.empty()) {
         // Apply reversed order to keep the original ordering of the reused
@@ -3985,6 +4027,126 @@
   }
 }
 
+DenseMap<Value *, SmallVector<StoreInst *, 4>>
+BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
+  unsigned NumLanes = TE->Scalars.size();
+  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+  for (unsigned Lane = 0, E = NumLanes; Lane != E; ++Lane) {
+    Value *V = TE->Scalars[Lane];
+    // To save compilation time we don't visit if we have too many users.
+    static constexpr unsigned UsersLimit = 4;
+    if (V->hasNUsesOrMore(UsersLimit))
+      break;
+
+    // Collect stores per pointer object.
+    for (User *U : V->users()) {
+      if (!isa<StoreInst>(U))
+        continue;
+      // Skip the store if it is already in the tree.
+      if (getTreeEntry(U))
+        continue;
+      auto *SI = cast<StoreInst>(U);
+      if (!SI->isSimple())
+        continue;
+      if (!isValidElementType(SI->getValueOperand()->getType()))
+        continue;
+
+      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+      auto &StoresVec = PtrToStoresMap[Ptr];
+      // For now just keep one store per pointer object per lane.
+      // TODO: Extend this to support multiple stores per pointer per lane.
+      if (StoresVec.size() > Lane)
+        continue;
+      // Skip if in different BBs.
+      if (!StoresVec.empty() &&
+          SI->getParent() != StoresVec.back()->getParent())
+        continue;
+      StoresVec.push_back(SI);
+    }
+  }
+  return PtrToStoresMap;
+}
+
+SmallVector<BoUpSLP::OrdersType, 1>
+BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
+  unsigned NumLanes = TE->Scalars.size();
+
+  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+      collectUserStores(TE);
+
+  // Holds the reorder indices for each candidate store vector that is a user of
+  // the current TreeEntry.
+  SmallVector<OrdersType, 1> ExternalReorderIndices;
+
+  // Now inspect the stores collected per pointer and look for vectorization
+  // candidates. For each candidate calculate the reorder index vector and push
+  // it into `ExternalReorderIndices`.
+  for (const auto &Pair : PtrToStoresMap) {
+    auto &StoresVec = Pair.second;
+    // If we have fewer than NumLanes stores, then we can't form a vector.
+    if (StoresVec.size() != NumLanes)
+      continue;
+
+    // Computes the store address diff S1 - S2.
+    auto GetStoreAddrDiff = [this](StoreInst *S1, StoreInst *S2) {
+      return getPointersDiff(
+          S2->getValueOperand()->getType(), S2->getPointerOperand(),
+          S1->getValueOperand()->getType(), S1->getPointerOperand(), *DL, *SE,
+          /*StrictCheck=*/true);
+    };
+    bool FailedToCompare = false;
+    auto Cmp = [&FailedToCompare, &GetStoreAddrDiff](StoreInst *S1,
+                                                     StoreInst *S2) {
+      auto Diff = GetStoreAddrDiff(S1, S2);
+      if (!Diff) {
+        FailedToCompare = true;
+        return false;
+      }
+      return Diff < 0;
+    };
+    // Sort the vector based on the pointers. We create a copy because we may
+    // need the original later for calculating the reorder (shuffle) indices.
+    auto StoresVecSorted = StoresVec;
+    stable_sort(StoresVecSorted, Cmp);
+    // If we failed to compare stores, then just abandon this stores vector.
+    if (FailedToCompare)
+      continue;
+
+    // \returns true if the stores in \p SortedVec are consecutive.
+    auto CanFormVector = [&GetStoreAddrDiff](const auto &SortedVec) {
+      for (unsigned Idx = 1, E = SortedVec.size(); Idx < E; ++Idx)
+        if (GetStoreAddrDiff(SortedVec[Idx], SortedVec[Idx - 1]) != 1)
+          return false;
+      return true;
+    };
+    // If the stores are not consecutive then abandon this stores vector.
+    if (!CanFormVector(StoresVecSorted))
+      continue;
+
+    // The scalars in StoresVec can form a vector instruction, so calculate the
+    // shuffle indices.
+    ExternalReorderIndices.resize(ExternalReorderIndices.size() + 1);
+    OrdersType &ReorderIndices = ExternalReorderIndices.back();
+    for (StoreInst *SI : StoresVec) {
+      unsigned Idx = llvm::find(StoresVecSorted, SI) - StoresVecSorted.begin();
+      ReorderIndices.push_back(Idx);
+    }
+
+    // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
+    // reorderTopToBottom() and reorderBottomToTop(), so we are following the
+    // same convention here.
+    auto IsIdentityOrder = [](const OrdersType &Order) {
+      for (unsigned Idx = 0, E = Order.size(); Idx != E; ++Idx)
+        if (Idx != Order[Idx])
+          return false;
+      return true;
+    };
+    if (IsIdentityOrder(ReorderIndices))
+      ReorderIndices.clear();
+  }
+  return ExternalReorderIndices;
+}
+
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
@@ -3992,6 +4154,10 @@
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
+  // Look for reorder indices of external users that will probably be vectorized
+  for (auto &TE : VectorizableTree)
+    TE->ExternalUserReorderIndices =
+        findExternalStoreUsersReorderIndices(TE.get());
 }
 
 namespace {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -14,18 +14,17 @@
 ; CHECK-NEXT:    [[LD:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]],
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]],
 ; CHECK-NEXT:    [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PTRA1]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]],
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]],
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    ret void
 ;
 bb1:
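
For readers who want to see the reorder-index computation in isolation, here is a minimal standalone sketch (not part of the patch; computeReorderIndices and LaneOffsets are hypothetical names used only for illustration). Under the assumption that collectUserStores() has already produced exactly one external store per lane, it mirrors what findExternalStoreUsersReorderIndices() does: sort the stores by address, require them to be consecutive, record for each lane the position its store takes in the sorted order, and canonicalize an identity order to an empty vector.

// Illustrative sketch only, not the patch's code.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// LaneOffsets[Lane] is the (hypothetical) element offset written by the store
// that uses lane Lane of the tree entry.
static std::vector<unsigned>
computeReorderIndices(const std::vector<int> &LaneOffsets) {
  // Sort lane ids by the address (offset) of their store.
  std::vector<unsigned> SortedPos(LaneOffsets.size());
  std::iota(SortedPos.begin(), SortedPos.end(), 0u);
  std::stable_sort(SortedPos.begin(), SortedPos.end(),
                   [&](unsigned A, unsigned B) {
                     return LaneOffsets[A] < LaneOffsets[B];
                   });
  // Reject non-consecutive stores, like the patch's pointer-diff == 1 check.
  // (In this sketch "no usable order" and "identity order" both come back empty.)
  for (unsigned I = 1; I < SortedPos.size(); ++I)
    if (LaneOffsets[SortedPos[I]] - LaneOffsets[SortedPos[I - 1]] != 1)
      return {};
  // ReorderIndices[Lane] = position of that lane's store in the sorted order.
  std::vector<unsigned> ReorderIndices(LaneOffsets.size());
  for (unsigned Pos = 0; Pos < SortedPos.size(); ++Pos)
    ReorderIndices[SortedPos[Pos]] = Pos;
  // Identity orders are represented as an empty vector.
  bool Identity = true;
  for (unsigned I = 0; I < ReorderIndices.size(); ++I)
    Identity &= (ReorderIndices[I] == I);
  if (Identity)
    ReorderIndices.clear();
  return ReorderIndices;
}

int main() {
  // Lane 0 stores to A[1], lane 1 stores to A[0] (element offsets).
  std::vector<unsigned> Order = computeReorderIndices({1, 0});
  for (unsigned Idx : Order)
    std::printf("%u ", Idx); // prints "1 0"
  std::printf("\n");
}

For the swapped stores in the test above (lane 0 writing A[1], lane 1 writing A[0]) this yields {1, 0}, which is the order that lets the store bundle be emitted without the extra shufflevector.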
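
On the reorderTopToBottom()/reorderBottomToTop() side, the change amounts to extra votes in the existing order-counting scheme. A rough sketch of that counting, using std::map instead of LLVM's DenseMap and hypothetical names (Candidates, OrdersUses), and assuming the most frequently requested order wins:

// Illustrative sketch only, not the patch's code.
#include <cstdio>
#include <map>
#include <vector>

using OrdersTy = std::vector<unsigned>;

int main() {
  // Candidate orders: one from the node itself (identity, modeled as {}),
  // two from external store users asking for {1, 0}.
  std::vector<OrdersTy> Candidates = {{}, {1, 0}, {1, 0}};
  std::map<OrdersTy, unsigned> OrdersUses;
  for (const OrdersTy &Order : Candidates)
    ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; // same idiom as the patch
  // Pick the most used order.
  const auto *Best = &*OrdersUses.begin();
  for (const auto &Pair : OrdersUses)
    if (Pair.second > Best->second)
      Best = &Pair;
  std::printf("winning order has %u votes and %zu indices\n", Best->second,
              Best->first.size());
}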