diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2181,6 +2181,29 @@
                                 const DataLayout &DL, ScalarEvolution &SE,
                                 const BoUpSLP &R);
 
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over
+  /// the users of \p TE and collects the stores. It returns a map from the
+  /// underlying store-pointer object to the collected stores.
+  DenseMap<Value *, SmallVector<StoreInst *, 4>>
+  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
+
+  /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
+  /// stores in \p StoresVec can form a vector instruction and returns true if
+  /// so. If true, it also updates \p StoresVecSorted with the contents of \p
+  /// StoresVec, but sorted in the order in which they would get vectorized.
+  bool CanFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
+                     SmallVector<StoreInst *, 4> &StoresVecSorted) const;
+
+  /// Iterates through the users of \p TE, looking for scalar stores that can
+  /// potentially be vectorized in a future SLP tree. If found, it keeps track
+  /// of their order and builds an order index vector for each store bundle.
+  /// It returns all these order vectors found.
+  /// We run this after the tree has been built; otherwise we may come across
+  /// user instructions that are not yet in the tree.
+  SmallVector<OrdersType, 1>
+  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
+
   struct TreeEntry {
     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
     TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2264,6 +2287,10 @@
     /// Does this entry require reordering?
     SmallVector<unsigned, 4> ReorderIndices;
 
+    /// The vector of reorder (i.e., shuffle) indices of external users that
+    /// are probably going to be vectorized in a future SLP tree.
+    SmallVector<OrdersType, 1> ExternalUserReorderIndices;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -2453,6 +2480,14 @@
       for (unsigned ReorderIdx : ReorderIndices)
         dbgs() << ReorderIdx << ", ";
       dbgs() << "\n";
+      dbgs() << "ExternalUserReorderIndices: ";
+      for (const auto &ExtReorderIndices : ExternalUserReorderIndices) {
+        dbgs() << "{";
+        for (unsigned Idx : ExtReorderIndices)
+          dbgs() << Idx << ", ";
+        dbgs() << "}, ";
+      }
+      dbgs() << "\n";
       dbgs() << "UserTreeIndices: ";
       for (const auto &EInfo : UserTreeIndices)
         dbgs() << EInfo << ", ";
@@ -3525,6 +3560,8 @@
       if (TE->State != TreeEntry::Vectorize)
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
+    if (!TE->ExternalUserReorderIndices.empty())
+      VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
   });
 
   // Reorder the graph nodes according to their vectorization factor.
@@ -3554,6 +3591,14 @@
           return GathersToOrders.find(OpTE)->second;
         return OpTE->ReorderIndices;
       }();
+      // First consider the order of the external scalar users.
+      if (!OpTE->ExternalUserReorderIndices.empty()) {
+        for (const OrdersType &ExtOrder : OpTE->ExternalUserReorderIndices)
+          ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+        // No other useful reorder data in this entry.
+        if (Order.empty())
+          continue;
+      }
       // Stores actually store the mask, not the order, need to invert.
       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
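For context on the hunk above: every candidate order casts a vote in `OrdersUses`, and reorderTopToBottom() later picks the most frequent one for the node; this patch simply lets the external store bundles vote first. Below is a minimal standalone sketch of that counting idiom, assuming `std::map` in place of LLVM's `DenseMap` and made-up vote data; it is illustrative, not the LLVM code.

#include <cstdio>
#include <map>
#include <vector>

using OrdersType = std::vector<unsigned>;

int main() {
  // Hypothetical votes: two external store bundles prefer {1,0}, one
  // in-tree candidate prefers {0,1}.
  std::vector<OrdersType> Candidates = {{1, 0}, {1, 0}, {0, 1}};
  std::map<OrdersType, unsigned> OrdersUses;
  for (const OrdersType &O : Candidates)
    // Same idiom as the patch: insert with count 0, then increment.
    ++OrdersUses.insert(std::make_pair(O, 0)).first->second;
  // Pick the most popular order.
  const OrdersType *Best = nullptr;
  unsigned BestCount = 0;
  for (const auto &P : OrdersUses)
    if (P.second > BestCount) {
      Best = &P.first;
      BestCount = P.second;
    }
  std::printf("winning order: {%u,%u} with %u votes\n", (*Best)[0],
              (*Best)[1], BestCount);
  return 0;
}

Counting the external orders before the node's own data also lets an entry whose only ordering information comes from external store users participate in the vote; the `if (Order.empty()) continue;` then skips the rest of the loop body because such an entry carries no other reordering data.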
@@ -3620,6 +3665,8 @@
         // Build correct orders for extract{element,value}, loads and
         // stores.
         reorderOrder(TE->ReorderIndices, Mask);
+        for (OrdersType &ExtReorder : TE->ExternalUserReorderIndices)
+          reorderOrder(ExtReorder, Mask);
         if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
           TE->reorderOperands(Mask);
       } else {
@@ -3628,6 +3675,8 @@
         assert(TE->ReorderIndices.empty() &&
                "Expected empty reorder sequence.");
         reorderScalars(TE->Scalars, Mask);
+        for (OrdersType &ExtReorder : TE->ExternalUserReorderIndices)
+          reorderOrder(ExtReorder, Mask);
       }
       if (!TE->ReuseShuffleIndices.empty()) {
         // Apply reversed order to keep the original ordering of the reused
@@ -3985,6 +4034,136 @@
   }
 }
 
+DenseMap<Value *, SmallVector<StoreInst *, 4>>
+BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
+  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
+  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
+    Value *V = TE->Scalars[Lane];
+    // To save compilation time we don't visit if we have too many users.
+    static constexpr unsigned UsersLimit = 4;
+    if (V->hasNUsesOrMore(UsersLimit))
+      break;
+
+    // Collect stores per pointer object.
+    for (User *U : V->users()) {
+      auto *SI = dyn_cast<StoreInst>(U);
+      if (SI == nullptr || !SI->isSimple() ||
+          !isValidElementType(SI->getValueOperand()->getType()))
+        continue;
+      // Skip entry if it is already in the tree.
+      if (getTreeEntry(U))
+        continue;
+
+      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+      auto &StoresVec = PtrToStoresMap[Ptr];
+      // For now just keep one store per pointer object per lane.
+      // TODO: Extend this to support multiple stores per pointer per lane.
+      if (StoresVec.size() > Lane)
+        continue;
+      // Skip if in different BBs.
+      if (!StoresVec.empty() &&
+          SI->getParent() != StoresVec.back()->getParent())
+        continue;
+      // Make sure that the stores are of the same type.
+      if (!StoresVec.empty() &&
+          SI->getValueOperand()->getType() !=
+              StoresVec.back()->getValueOperand()->getType())
+        continue;
+      StoresVec.push_back(SI);
+    }
+  }
+  return PtrToStoresMap;
+}
+
+bool BoUpSLP::CanFormVector(
+    const SmallVector<StoreInst *, 4> &StoresVec,
+    SmallVector<StoreInst *, 4> &StoresVecSorted) const {
+  // Sort the vector based on the pointers. We create a copy because we may
+  // need the original later for calculating the reorder (shuffle) indices.
+  StoresVecSorted = StoresVec;
+  bool FailedToCompare = false;
+  stable_sort(
+      StoresVecSorted, [&FailedToCompare, this](StoreInst *S1, StoreInst *S2) {
+        // Early return if we already failed, to avoid calling
+        // getPointersDiff() again.
+        if (FailedToCompare)
+          return false;
+        assert(S1->getValueOperand()->getType() ==
+                   S2->getValueOperand()->getType() &&
+               "Different types should have been rejected by "
+               "collectUserStores()");
+        Optional<int> Diff = getPointersDiff(
+            S2->getValueOperand()->getType(), S2->getPointerOperand(),
+            S1->getValueOperand()->getType(), S1->getPointerOperand(), *DL,
+            *SE, /*StrictCheck=*/true);
+        if (!Diff) {
+          FailedToCompare = true;
+          return false;
+        }
+        return *Diff < 0;
+      });
+  // If we failed to compare any two stores, then just abandon this vector.
+  if (FailedToCompare)
+    return false;
+
+  // Check if the stores are consecutive by checking if last-first == size-1.
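CanFormVector() reduces consecutiveness to two steps: sort the stores by pointer distance, then check that the first and last store are exactly `size() - 1` elements apart. Here is a self-contained sketch of the same technique, with plain integer offsets standing in for `getPointersDiff()` results; `canFormConsecutiveRun` is a hypothetical name, and distinct offsets are assumed (so the last-minus-first test is exact).

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper mirroring the check above with integer offsets.
static bool canFormConsecutiveRun(const std::vector<int> &Offsets,
                                  std::vector<int> &SortedOffsets) {
  // Keep a sorted copy; the caller still needs the original lane order.
  SortedOffsets = Offsets;
  std::stable_sort(SortedOffsets.begin(), SortedOffsets.end());
  // Distinct offsets are consecutive iff the range spans size()-1 elements.
  return SortedOffsets.back() - SortedOffsets.front() ==
         (int)SortedOffsets.size() - 1;
}

int main() {
  std::vector<int> Sorted;
  assert(canFormConsecutiveRun({3, 1, 2, 0}, Sorted));  // sorts to {0,1,2,3}
  assert(!canFormConsecutiveRun({0, 2, 4, 6}, Sorted)); // strided: has gaps
  return 0;
}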
+  StoreInst *LastSI = StoresVecSorted.back();
+  StoreInst *FirstSI = StoresVecSorted.front();
+  Optional<int> PtrRange = getPointersDiff(
+      FirstSI->getValueOperand()->getType(), FirstSI->getPointerOperand(),
+      LastSI->getValueOperand()->getType(), LastSI->getPointerOperand(), *DL,
+      *SE, /*StrictCheck=*/true);
+  if (!PtrRange)
+    return false;
+  return *PtrRange == (int)StoresVecSorted.size() - 1;
+}
+
+SmallVector<OrdersType, 1>
+BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
+  unsigned NumLanes = TE->Scalars.size();
+
+  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
+      collectUserStores(TE);
+
+  // Holds the reorder indices for each candidate store vector that is a user
+  // of the current TreeEntry.
+  SmallVector<OrdersType, 1> ExternalReorderIndices;
+
+  // Now inspect the stores collected per pointer and look for vectorization
+  // candidates. For each candidate calculate the reorder index vector and
+  // push it into `ExternalReorderIndices`.
+  for (const auto &Pair : PtrToStoresMap) {
+    auto &StoresVec = Pair.second;
+    // If we have fewer than NumLanes stores, then we can't form a vector.
+    if (StoresVec.size() != NumLanes)
+      continue;
+
+    // If the stores are not consecutive then abandon this stores vector.
+    SmallVector<StoreInst *, 4> StoresVecSorted;
+    if (!CanFormVector(StoresVec, StoresVecSorted))
+      continue;
+
+    // The scalars in StoresVec can form a vector instruction, so calculate
+    // the shuffle indices.
+    ExternalReorderIndices.resize(ExternalReorderIndices.size() + 1);
+    OrdersType &ReorderIndices = ExternalReorderIndices.back();
+    for (StoreInst *SI : StoresVec) {
+      unsigned Idx = llvm::find(StoresVecSorted, SI) - StoresVecSorted.begin();
+      ReorderIndices.push_back(Idx);
+    }
+
+    // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
+    // reorderTopToBottom() and reorderBottomToTop(), so we are following the
+    // same convention here.
+    auto IsIdentityOrder = [](const OrdersType &Order) {
+      for (unsigned Idx : seq<unsigned>(0, Order.size()))
+        if (Idx != Order[Idx])
+          return false;
+      return true;
+    };
+    if (IsIdentityOrder(ReorderIndices))
+      ReorderIndices.clear();
+  }
+  return ExternalReorderIndices;
+}
+
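The loop above derives each lane's shuffle index by locating the store in the sorted copy. A small self-contained sketch of that mapping, where `LaneOrder` and `Sorted` are illustrative stand-ins for `StoresVec` and `StoresVecSorted`:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical byte offsets of four stores, in lane (IR) order and in
  // sorted (memory) order.
  std::vector<int> LaneOrder = {8, 0, 4, 12};
  std::vector<int> Sorted = {0, 4, 8, 12};
  // Each lane's reorder index is its position in the sorted vector.
  std::vector<unsigned> ReorderIndices;
  for (int Off : LaneOrder)
    ReorderIndices.push_back(
        std::find(Sorted.begin(), Sorted.end(), Off) - Sorted.begin());
  // An identity result (e.g., {0,1,2,3}) would be cleared to {} to match
  // the convention in reorderTopToBottom()/reorderBottomToTop().
  for (unsigned Idx : ReorderIndices)
    std::printf("%u ", Idx); // prints: 2 0 1 3
  std::printf("\n");
  return 0;
}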
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
@@ -3992,6 +4171,10 @@
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
+  // Look for reorder indices of external users that will probably be
+  // vectorized.
+  for (auto &TE : VectorizableTree)
+    TE->ExternalUserReorderIndices =
+        findExternalStoreUsersReorderIndices(TE.get());
 }
 
 namespace {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -14,18 +14,17 @@
 ; CHECK-NEXT:    [[LD:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]],
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]],
 ; CHECK-NEXT:    [[PTRA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PTRA1]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]],
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]],
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    ret void
 ;
 bb1:
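For intuition, here is a rough scalar analogue of the IR in this test; the shape matches the CHECK lines above, but the function name, variable names, and constants are all illustrative, not taken from the .ll file.

// The two lanes are stored to A[0..1] in the opposite order from the
// tree's original lane order, so before this patch the store needed a
// <1,0> shuffle; with the patch the tree itself is built in store order
// and the later compare swaps its operands instead.
void reorder_example(double *A, double Ld) {
  double V0 = (Ld + 1.1) * 2.1; // lane 0 of the SLP tree
  double V1 = (Ld + 1.2) * 2.2; // lane 1 of the SLP tree
  A[0] = V1; // external store users want {V1, V0} ...
  A[1] = V0; // ... i.e. reorder indices {1, 0}
  // bb2: external scalar users of both lanes.
  bool Seed = (V0 + 3.3) > (V1 + 4.4);
  (void)Seed;
}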