diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -117,7 +117,8 @@ "Consider loops that are already vectorizable for loop distribution.")); static cl::opt DistributeMergeVectorizablePartitions( - "loop-distribute-merge-vectorizable-partitions", cl::init(true), cl::Hidden, + "loop-distribute-merge-vectorizable-partitions", cl::init(false), + cl::Hidden, cl::desc("Merge adjacent partitions that are already vectorizable.")); static cl::opt PragmaDistributeSCEVCheckThreshold( @@ -132,6 +133,16 @@ cl::desc("Enable the new, experimental LoopDistribution Pass"), cl::init(false)); +static cl::opt DistributeNearbyLoadStoreThreshold( + "loop-distribute-nearby-load-store-threshold", cl::init(4), cl::Hidden, + cl::desc("Pairs of load/stores below the threshold (measured in number of " + "elements) will be merged into the " + "same partition.")); + +static cl::opt DistributeMinLoadStorePerPartition( + "loop-distribute-min-load-store-per-partition", cl::init(1), cl::Hidden, + cl::desc("Minimum number of load or store instructions in a disributed loop.")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -305,14 +316,25 @@ ValueToValueMapTy VMap; }; +// calculate number of load/store instrictions in a partition +static unsigned PartitionLoadStoreCount(InstPartition *A) { + unsigned LoadStoreCount = 0; + for (Instruction *InstA : *A) { + if (isa(InstA) || isa(InstA)) + LoadStoreCount++; + } + return LoadStoreCount; +} + /// Holds the set of Partitions. It populates them, merges them and then /// clones the loops. class InstPartitionContainer { using InstToPartitionIdT = DenseMap; public: - InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) - : L(L), LI(LI), DT(DT) {} + InstPartitionContainer(Function *F, Loop *L, LoopInfo *LI, DominatorTree *DT, + ScalarEvolution *SE) + : F(F), L(L), LI(LI), DT(DT), SE(SE) {} /// Returns the number of partitions. unsigned getSize() const { return PartitionContainer.size(); } @@ -370,6 +392,54 @@ void mergeBeforePopulating() { if (DistributeMergeVectorizablePartitions) mergeAdjacentNonCyclic(); + } + + /// If two adjacent partitions have accesses to the same buffer (with a + /// small-ish stride between them), we should do both accesses during the same + /// iteration to prevent pulling the same data into cache multiple times. + void mergePartitionsWithNearbyAccesses() { + mergeAdjacentPartitionsIf([&](InstPartition *A, InstPartition *B) { + for (Instruction *InstA : *A) { + for (Instruction *InstB : *B) { + // If either partition is not vectorizable, we shouldn't merge or we + // might lose opportunities for vectorization. + if (A->hasDepCycle() || B->hasDepCycle()) + return false; + + Value *PtrA = getLoadStorePointerOperand(InstA); + Value *PtrB = getLoadStorePointerOperand(InstB); + if (!PtrA || !PtrB) + return false; + Optional Distance = getPointersDiff( + PtrA, PtrB, F->getParent()->getDataLayout(), *SE, false, false); + + if (Distance && + (unsigned)abs(*Distance) < DistributeNearbyLoadStoreThreshold) { + LLVM_DEBUG( + dbgs() + << "Merging partitions due to nearby load/stores in multiple " + << "partitions: " << A << ", " << B << "\n" + << "distance: " << Distance << "\n" + << "\t" << *InstA << "\n" + << "\t" << *InstB << "\n"); + return true; + } + } + } + return false; + }); + } + + void mergeSmallPartitions() { + mergeAdjacentPartitionsIf([&](InstPartition *A, InstPartition *B) { + return PartitionLoadStoreCount(A) < DistributeMinLoadStorePerPartition || + PartitionLoadStoreCount(B) < DistributeMinLoadStorePerPartition; + }); + } + + void mergeAfterPopulating() { + mergeSmallPartitions(); + mergePartitionsWithNearbyAccesses(); if (!DistributeNonIfConvertible) mergeNonIfConvertible(); } @@ -391,8 +461,7 @@ // Step through the partitions and create equivalence between partitions // that contain the same load. Also put partitions in between them in the // same equivalence class to avoid reordering of memory operations. - for (PartitionContainerT::iterator I = PartitionContainer.begin(), - E = PartitionContainer.end(); + for (auto I = PartitionContainer.begin(), E = PartitionContainer.end(); I != E; ++I) { auto *PartI = &*I; @@ -603,14 +672,15 @@ /// belongs to multiple partitions the entry contains -1. InstToPartitionIdT InstToPartitionId; + Function *F; Loop *L; LoopInfo *LI; DominatorTree *DT; + ScalarEvolution *SE; /// The control structure to merge adjacent partitions if both satisfy /// the \p Predicate. - template - void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { + void mergeAdjacentPartitionsIf(const std::function &Predicate) { InstPartition *PrevMatch = nullptr; for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { auto DoesMatch = Predicate(&*I); @@ -627,6 +697,22 @@ } } + void mergeAdjacentPartitionsIf(const std::function &Predicate) { + for (auto I = PartitionContainer.begin(); + I != PartitionContainer.end() && + std::next(I, 1) != PartitionContainer.end();) { + auto J = std::next(I, 1); + if (Predicate(&*I, &*J)) { + J->moveTo(*I); + PartitionContainer.erase(J); + + // Try merging with I again, so not changing I + } else { + ++I; + } + } + } + /// Assign new LoopIDs for the partition's cloned loop. void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { Optional PartitionID = makeFollowupLoopID( @@ -732,7 +818,7 @@ return fail("NoDeps", "dependency analysis failed"); } - InstPartitionContainer Partitions(L, LI, DT); + InstPartitionContainer Partitions(F, L, LI, DT, SE); // First, go through each memory operation and assign them to consecutive // partitions (the order of partitions follows program order). Put those @@ -786,7 +872,7 @@ return fail("CantIsolateUnsafeDeps", "cannot isolate unsafe dependencies"); - // Run the merge heuristics: Merge non-cyclic adjacent partitions since we + // Run some merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); @@ -809,6 +895,10 @@ "cannot isolate unsafe dependencies"); } + Partitions.mergeAfterPopulating(); + if (Partitions.getSize() < 2) + return fail("Unprofitable", "no profitable loop distribution found"); + // Don't distribute the loop if we need too many SCEV run-time checks, or // any if it's illegal. const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); diff --git a/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll --- a/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll +++ b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-merge-vectorizable-partitions=false -verify-loop-info -verify-dom-info -S < %s \ +; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-min-load-store-per-partition=0 -verify-loop-info -verify-dom-info -S < %s \ ; RUN: | FileCheck %s ; for (i = 0; i < n; i ++) {