This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Transforms/Scalar/
-
Transforms/
-
Scalar/
-
LoopDistribute.cpp
-
test/Transforms/LoopDistribute/
-
Transforms/
-
LoopDistribute/
-
bug-uses-outside-loop.ll

Differential D100381

[RFC] Improve loop distribute cost model
Needs ReviewPublic

Authored by sanwou01 on Apr 13 2021, 5:56 AM.

Download Raw Diff

Details

Reviewers

SjoerdMeijer
qcolombet
dmgreen
davide
fhahn
jdoerfert
anemet
nikic
lebedev.ri

Summary

This is a first stab at an improved cost model for loop distribution,
replacing "always merge adjacent vectorizable partitions" with something
more fine-grained.

Two new heuristics are added. First, any adjacent partitions that have
nearby memory accesses are merged. This helps in cases where we would
otherwise separate accesses to the same buffer. (In particular, this
prevents pathologically bad behaviour on (hand-)unrolled loops.)

Second, any partition that is too small is merged with its neighbours.
This should help to keep ILP and MLP high. Currently, any partition
without load/stores is considered "too small", but I expect that this
will need some more tuning.

This seems to give reasonable results with some outliers that I need to
look at more. From the test suite:

                                                                                           delta exec time
benchmark                                                                      #loop-dist  (lower is better)
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/gesummv/gesummv.test          1   0.864    (i.e. +86.4% exec time vs no loop distribute)
SingleSource/Benchmarks/Stanford/Bubblesort.test                                       3   0.265
SingleSource/Benchmarks/Polybench/linear-algebra/solvers/durbin/durbin.test            2   0.214
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/bicg/bicg.test                2   0.144
SingleSource/Benchmarks/Misc/fp-convert.test                                           1   0.108
SingleSource/Benchmarks/Stanford/Treesort.test                                         2   0.091
SingleSource/Benchmarks/CoyoteBench/fftbench.test                                      1   0.081
MultiSource/Applications/hbd/hbd.test                                                  1   0.08
MultiSource/Benchmarks/TSVC/Equivalencing-dbl/Equivalencing-dbl.test                   1   0.062
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/syr2k/syr2k.test              2   0.06
SingleSource/Benchmarks/Stanford/Quicksort.test                                        3   0.046
MultiSource/Benchmarks/MiBench/consumer-typeset/consumer-typeset.test                  1   0.042
MultiSource/Benchmarks/TSVC/LinearDependence-dbl/LinearDependence-dbl.test             1   0.042
MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/miniAMR.test                            9   0.04
MultiSource/Benchmarks/MallocBench/espresso/espresso.test                              4   0.031
MultiSource/Benchmarks/TSVC/Expansion-flt/Expansion-flt.test                           1   0.029
MultiSource/Benchmarks/TSVC/CrossingThresholds-flt/CrossingThresholds-flt.test         1   0.023
MultiSource/Benchmarks/TSVC/CrossingThresholds-dbl/CrossingThresholds-dbl.test         1   0.022
MultiSource/Benchmarks/VersaBench/bmm/bmm.test                                         1   0.021
SingleSource/Benchmarks/McGill/queens.test                                             1   0.02
MultiSource/Benchmarks/TSVC/ControlFlow-dbl/ControlFlow-dbl.test                       1   0.019
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/symm/symm.test                1   0.018
MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test                            2   0.017
MultiSource/Benchmarks/TSVC/NodeSplitting-dbl/NodeSplitting-dbl.test                   1   0.016
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/trmm/trmm.test                1   0.016
MultiSource/Benchmarks/TSVC/ControlLoops-flt/ControlLoops-flt.test                     1   0.015
MultiSource/Benchmarks/TSVC/GlobalDataFlow-dbl/GlobalDataFlow-dbl.test                 1   0.015
MultiSource/Benchmarks/DOE-ProxyApps-C++/HACCKernels/HACCKernels.test                  2   0.013
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/doitgen/doitgen.test          1   0.011
MultiSource/Benchmarks/TSVC/StatementReordering-dbl/StatementReordering-dbl.test       1   0.011
SingleSource/Benchmarks/Polybench/stencils/fdtd-apml/fdtd-apml.test                    4   0.007
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/syrk/syrk.test                1   0.006
MultiSource/Benchmarks/TSVC/NodeSplitting-flt/NodeSplitting-flt.test                   1   0.006
MultiSource/Benchmarks/TSVC/InductionVariable-flt/InductionVariable-flt.test           1   0.005
MultiSource/Benchmarks/Bullet/bullet.test                                              8   0.004
MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CLAMR.test                              6   0.004
MultiSource/Applications/oggenc/oggenc.test                                            6   0.004
SingleSource/Benchmarks/Polybench/stencils/adi/adi.test                                2   0.004
MultiSource/Benchmarks/TSVC/Equivalencing-flt/Equivalencing-flt.test                   1   0.004
MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.test                        3   0.003
MultiSource/Benchmarks/TSVC/Recurrences-flt/Recurrences-flt.test                       1   0.002
MultiSource/Benchmarks/ASC_Sequoia/AMGmk/AMGmk.test                                    1   0.002
MultiSource/Benchmarks/TSVC/IndirectAddressing-flt/IndirectAddressing-flt.test         1   0.002
MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test                                11  0.001
MultiSource/Benchmarks/McCat/04-bisect/bisect.test                                     4   0.001
SingleSource/Benchmarks/Polybench/linear-algebra/solvers/dynprog/dynprog.test          2   0.001
MultiSource/Applications/SPASS/SPASS.test                                              1   0.001
MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg.test                              14  0
MultiSource/Benchmarks/Prolangs-C/agrep/agrep.test                                     9   0
MultiSource/Benchmarks/TSVC/Searching-dbl/Searching-dbl.test                           1   0
MultiSource/Benchmarks/TSVC/Searching-flt/Searching-flt.test                           1   0
MultiSource/Benchmarks/7zip/7zip-benchmark.test                                        16  -0.001
MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test                        1   -0.001
MultiSource/Benchmarks/TSVC/Reductions-dbl/Reductions-dbl.test                         1   -0.001
MultiSource/Benchmarks/TSVC/Reductions-flt/Reductions-flt.test                         1   -0.001
MultiSource/Benchmarks/TSVC/LoopRestructuring-flt/LoopRestructuring-flt.test           1   -0.001
MultiSource/Benchmarks/TSVC/LoopRerolling-flt/LoopRerolling-flt.test                   1   -0.001
MultiSource/Applications/JM/lencod/lencod.test                                         6   -0.002
MultiSource/Benchmarks/TSVC/ControlFlow-flt/ControlFlow-flt.test                       1   -0.002
MultiSource/Applications/viterbi/viterbi.test                                          1   -0.002
MultiSource/Benchmarks/TSVC/Symbolics-flt/Symbolics-flt.test                           1   -0.004
MultiSource/Benchmarks/TSVC/StatementReordering-flt/StatementReordering-flt.test       1   -0.004
MultiSource/Benchmarks/TSVC/Packing-flt/Packing-flt.test                               1   -0.005
MultiSource/Benchmarks/TSVC/LinearDependence-flt/LinearDependence-flt.test             1   -0.005
SingleSource/Benchmarks/Linpack/linpack-pc.test                                        1   -0.005
MultiSource/Benchmarks/ASC_Sequoia/CrystalMk/CrystalMk.test                            3   -0.006
MultiSource/Benchmarks/TSVC/LoopRerolling-dbl/LoopRerolling-dbl.test                   1   -0.006
MultiSource/Benchmarks/TSVC/GlobalDataFlow-flt/GlobalDataFlow-flt.test                 1   -0.007
MultiSource/Benchmarks/VersaBench/beamformer/beamformer.test                           2   -0.008
SingleSource/Benchmarks/Polybench/stencils/fdtd-2d/fdtd-2d.test                        1   -0.008
MultiSource/Benchmarks/TSVC/Expansion-dbl/Expansion-dbl.test                           1   -0.009
MultiSource/Applications/JM/ldecod/ldecod.test                                         16  -0.011
MultiSource/Benchmarks/TSVC/Recurrences-dbl/Recurrences-dbl.test                       1   -0.011
MultiSource/Benchmarks/sim/sim.test                                                    6   -0.013
MultiSource/Benchmarks/TSVC/Packing-dbl/Packing-dbl.test                               1   -0.013
MultiSource/Applications/sqlite3/sqlite3.test                                          3   -0.014
MultiSource/Applications/ClamAV/clamscan.test                                          2   -0.014
MultiSource/Benchmarks/TSVC/ControlLoops-dbl/ControlLoops-dbl.test                     1   -0.014
MultiSource/Benchmarks/TSVC/IndirectAddressing-dbl/IndirectAddressing-dbl.test         1   -0.019
MultiSource/Benchmarks/mafft/pairlocalalign.test                                       78  -0.02
SingleSource/Benchmarks/Polybench/linear-algebra/solvers/gramschmidt/gramschmidt.test  2   -0.02
MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2.test                            4   -0.024
MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test                        14  -0.027
MultiSource/Applications/obsequi/Obsequi.test                                          2   -0.027
MultiSource/Benchmarks/TSVC/Symbolics-dbl/Symbolics-dbl.test                           1   -0.029
SingleSource/Benchmarks/Misc-C++/oopack_v1p8.test                                      2   -0.03
MultiSource/Benchmarks/TSVC/InductionVariable-dbl/InductionVariable-dbl.test           1   -0.031
MultiSource/Benchmarks/TSVC/LoopRestructuring-dbl/LoopRestructuring-dbl.test           1   -0.065
SingleSource/Benchmarks/Polybench/linear-algebra/kernels/cholesky/cholesky.test        1   -0.08
SingleSource/Benchmarks/Polybench/stencils/jacobi-2d-imper/jacobi-2d-imper.test        2   -0.082
MultiSource/Benchmarks/MiBench/telecomm-FFT/telecomm-fft.test                          3   -0.125
MultiSource/Benchmarks/MallocBench/gs/gs.test                                          3   -0.151
SingleSource/Benchmarks/Polybench/stencils/jacobi-1d-imper/jacobi-1d-imper.test        2   -1

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

sanwou01 created this revision.Apr 13 2021, 5:56 AM

Herald added a subscriber: hiraditya. · View Herald TranscriptApr 13 2021, 5:56 AM

sanwou01 requested review of this revision.Apr 13 2021, 5:56 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 13 2021, 5:56 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B98468: Diff 337120.Apr 13 2021, 5:57 AM

lebedev.ri edited the summary of this revision. (Show Details)Apr 13 2021, 6:02 AM

sanwou01 added a parent revision: D99596: [LoopDist] Distribute vectorizable loops.Apr 13 2021, 6:05 AM

xbolva00 added a subscriber: xbolva00.Apr 13 2021, 6:16 AM

That certainly looks encouraging. Just some remarks about the perf numbers first. I found that the llvm test suite can be quite noisy and you certainly need to restrict it to the subset of CTMark for some more meaningful numbers. Just checking, did you do this? Because looking at all tests, I think I see more tests than I would expect with CTMark, but I could be wrong. How about a SPEC run for something that runs a bit longer?

loop dist in hmmer should always work :) and some improvements in TSVC should be expected as well.

https://github.com/UoB-HPC/TSVC_2/blob/master/src/tsvc.c#L1797

Since test suite can be quite noisy, can you check and share for additionally vectorized loops wrt new cost model?

In D100381#2685652, @xbolva00 wrote:

loop dist in hmmer should always work :)

Ha, true! Perhaps check that the expected gain is still there then. ;-)
But yeah, I basically want to say if we could check this with some other non/less noisy suite, so like this suggestion:

and some improvements in TSVC should be expected as well.

https://github.com/UoB-HPC/TSVC_2/blob/master/src/tsvc.c#L1797

Since test suite can be quite noisy, can you check and share for additionally vectorized loops wrt new cost model?

In D100381#2685619, @SjoerdMeijer wrote:

That certainly looks encouraging. Just some remarks about the perf numbers first. I found that the llvm test suite can be quite noisy and you certainly need to restrict it to the subset of CTMark for some more meaningful numbers. Just checking, did you do this? Because looking at all tests, I think I see more tests than I would expect with CTMark, but I could be wrong. How about a SPEC run for something that runs a bit longer?

Yeah, the test suite is fairly noisy (even with -DTEST_SUITE_BENCHMARKING_ONLY=On). CTMark is mostly for compile time, isn't it? I don't think it helps much in terms of noise.

In any case, I've only included benchmarks which have distributed loops, but there are changes (noise) of similar size in other tests too. IIRC SPEC was neutral except for the expected gain on hmmer, but I'll re-run them with the current patch.

In D100381#2685652, @xbolva00 wrote:

loop dist in hmmer should always work :) and some improvements in TSVC should be expected as well.

https://github.com/UoB-HPC/TSVC_2/blob/master/src/tsvc.c#L1797

I don't think we pick that one up (yet), because it's not the inner loop that needs distribution, which is a limitation of the current pass. It looks like there are a few more in TSVC, so I'll have a look at those.

Since test suite can be quite noisy, can you check and share for additionally vectorized loops wrt new cost model?

Good idea, I'll get some numbers for that. Loop distribution increases the total number of loops as well, so you'd naturally expect the number of vectorized loops to go up, so it'd be interesting to compare the change in number of loops to the number of *vectorized* loops.

sekharbvrs added a subscriber: sekharbvrs.Apr 14 2021, 1:44 AM

sanwou01 mentioned this in D99596: [LoopDist] Distribute vectorizable loops.Apr 14 2021, 5:31 AM

We also have some target-specific heuristics for loop-distribute, which focus on the number of memory streams a CPU can handle IIRC. I never got around to posting them upstream so far. Let me go back and look at those heuristics.

In D100381#2688791, @fhahn wrote:

We also have some target-specific heuristics for loop-distribute, which focus on the number of memory streams a CPU can handle IIRC. I never got around to posting them upstream so far. Let me go back and look at those heuristics.

That would be awesome, thanks!

In D100381#2685736, @sanwou01 wrote:

IIRC SPEC was neutral except for the expected gain on hmmer, but I'll re-run them with the current patch.

SPEC (AArch64, LTO) isn't where I'd like it to be just yet. hmmer's score is up 20% (vs no loop distribute), but the old loop distribute saw +45%, so looks like something isn't quite right. The other surprise is 2006 fprate 433.milc which is down 19%. I'll have a look at both of them.

nikic resigned from this revision.Apr 16 2021, 1:30 PM

Looking at TSVC a bit, @xbolva00 :

s221 won't distribute because the second read of a[i] is removed by EarlyCSE, so there is no unique load instruction for a second loop. For this loop I'm not convinced that distribution is likely to help performance; it's a trade-off between (some) vectorization and re-loading both a[i] and d[i].
s222 also gets mangled by EarlyCSE, but the result would still be distributable if it weren't for the order of the stores to e[i] and a[i]. This runs into a limitation of LoopAccessAnalysis, which can't reorder instructions. Perhaps it could help to do a bit of scheduling on IR?
s2275 as mentioned above, this runs into another LoopAccessAnalysis limitation: it only handles innermost loops. I'm not sure how easy (if at all possible) it would be to lift that restriction.

So, unfortunately, it looks like we can't handle these loops without some pretty fundamental changes to LoopAccessAnalysis. Thoughts?

Thank you for the detailed analysis!

I think it would be good to share this analysis on llvm-dev too and ask the community about LoopAccessAnalysis.

cc @fhahn @reames as they work on loop optimizations

ChuanqiXu added a subscriber: ChuanqiXu.Apr 21 2021, 5:28 AM

In D100381#2704777, @sanwou01 wrote:

Looking at TSVC a bit, @xbolva00 :

s221 won't distribute because the second read of a[i] is removed by EarlyCSE, so there is no unique load instruction for a second loop. For this loop I'm not convinced that distribution is likely to help performance; it's a trade-off between (some) vectorization and re-loading both a[i] and d[i].

s222 also gets mangled by EarlyCSE, but the result would still be distributable if it weren't for the order of the stores to e[i] and a[i]. This runs into a limitation of LoopAccessAnalysis, which can't reorder instructions. Perhaps it could help to do a bit of scheduling on IR?

s2275 as mentioned above, this runs into another LoopAccessAnalysis limitation: it only handles innermost loops. I'm not sure how easy (if at all possible) it would be to lift that restriction.

So, unfortunately, it looks like we can't handle these loops without some pretty fundamental changes to LoopAccessAnalysis. Thoughts?

Yes, those are known limitations and I would recommend focusing on showing loop-distribute's value with current LAA.

In D100381#2704830, @fhahn wrote:

In D100381#2704777, @sanwou01 wrote:

(snip)

So, unfortunately, it looks like we can't handle these loops without some pretty fundamental changes to LoopAccessAnalysis. Thoughts?

Yes, those are known limitations and I would recommend focusing on showing loop-distribute's value with current LAA.

On a general comment, I highly recommend trying to be incremental and fixing one set of issues at a time. If you can show benefit with current LAA, do that. If you can't and need to change LAA in some way, go do that and show the benefit on its own. Trying to do too many things at once is a recipe for failure.

(p.s. I know very little about the specific code in loop-distribute. This is just a general comment.)

Thanks for the comments. I'm happy to leave LoopAccessAnalysis alone for now and focus on Loop Distribute's cost model. I was hoping that a cost model tweak or two might enable some more loop distribution in TSVC, but it looks like that isn't the case.

Matt added a subscriber: Matt.Apr 22 2021, 6:24 AM

Maybe worth checking gcc’s test cases in gcc.dg/tree-ssa/ldist-*.c ?

This review seems to be stuck/dead, consider abandoning if no longer relevant.

Herald added a project: Restricted Project. · View Herald TranscriptJan 12 2023, 5:23 PM

Herald added a subscriber: StephenFan. · View Herald Transcript

Allen added a subscriber: Allen.Aug 9 2023, 4:41 AM

Herald added a subscriber: wangpc. · View Herald TranscriptAug 9 2023, 4:41 AM

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Scalar/

LoopDistribute.cpp

108 lines

test/

Transforms/

LoopDistribute/

bug-uses-outside-loop.ll

2 lines

Diff 337120

llvm/lib/Transforms/Scalar/LoopDistribute.cpp

//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//		//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
		Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 103 Lines • ▼ Show 20 Lines	cl::desc("The maximum number of runtime pointer aliasing checks for Loop "
"Distribution"));		"Distribution"));

static cl::opt<bool> DistributeVectorizableLoops(		static cl::opt<bool> DistributeVectorizableLoops(
"loop-distribute-vectorizable-loops", cl::init(true), cl::Hidden,		"loop-distribute-vectorizable-loops", cl::init(true), cl::Hidden,
cl::desc(		cl::desc(
"Consider loops that are already vectorizable for loop distribution."));		"Consider loops that are already vectorizable for loop distribution."));

static cl::opt<bool> DistributeMergeVectorizablePartitions(		static cl::opt<bool> DistributeMergeVectorizablePartitions(
"loop-distribute-merge-vectorizable-partitions", cl::init(true), cl::Hidden,		"loop-distribute-merge-vectorizable-partitions", cl::init(false),
		cl::Hidden,
cl::desc("Merge adjacent partitions that are already vectorizable."));		cl::desc("Merge adjacent partitions that are already vectorizable."));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(		static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
"loop-distribute-scev-check-threshold-with-pragma", cl::init(128),		"loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
cl::Hidden,		cl::Hidden,
cl::desc(		cl::desc(
"The maximum number of SCEV checks allowed for Loop "		"The maximum number of SCEV checks allowed for Loop "
"Distribution for loop marked with #pragma loop distribute(enable)"));		"Distribution for loop marked with #pragma loop distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(		static cl::opt<bool> EnableLoopDistribute(
"enable-loop-distribute", cl::Hidden,		"enable-loop-distribute", cl::Hidden,
cl::desc("Enable the new, experimental LoopDistribution Pass"),		cl::desc("Enable the new, experimental LoopDistribution Pass"),
cl::init(false));		cl::init(false));

		static cl::opt<unsigned> DistributeNearbyLoadStoreThreshold(
		"loop-distribute-nearby-load-store-threshold", cl::init(4), cl::Hidden,
		cl::desc("Pairs of load/stores below the threshold (measured in number of "
		"elements) will be merged into the "
		"same partition."));

		static cl::opt<unsigned> DistributeMinLoadStorePerPartition(
		"loop-distribute-min-load-store-per-partition", cl::init(1), cl::Hidden,
		cl::desc("Minimum number of load or store instructions in a disributed loop."));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");		STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {		namespace {

/// Maintains the set of instructions of the loop for a partition before		/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.		/// cloning. After cloning, it hosts the new loop.
class InstPartition {		class InstPartition {
using InstructionSet = SmallPtrSet<Instruction *, 8>;		using InstructionSet = SmallPtrSet<Instruction *, 8>;
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	private:
SmallVector<BasicBlock *, 8> ClonedLoopBlocks;		SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

/// These gets populated once the set of instructions have been		/// These gets populated once the set of instructions have been
/// finalized. If this partition is mapped to the original loop, these are not		/// finalized. If this partition is mapped to the original loop, these are not
/// set.		/// set.
ValueToValueMapTy VMap;		ValueToValueMapTy VMap;
};		};

		// calculate number of load/store instrictions in a partition
		static unsigned PartitionLoadStoreCount(InstPartition *A) {
		unsigned LoadStoreCount = 0;
		for (Instruction InstA : A) {
		if (isa<LoadInst>(InstA) \|\| isa<StoreInst>(InstA))
		LoadStoreCount++;
		}
		return LoadStoreCount;
		}

/// Holds the set of Partitions. It populates them, merges them and then		/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.		/// clones the loops.
class InstPartitionContainer {		class InstPartitionContainer {
using InstToPartitionIdT = DenseMap<Instruction *, int>;		using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:		public:
InstPartitionContainer(Loop L, LoopInfo LI, DominatorTree *DT)		InstPartitionContainer(Function F, Loop L, LoopInfo LI, DominatorTree DT,
: L(L), LI(LI), DT(DT) {}		ScalarEvolution *SE)
		: F(F), L(L), LI(LI), DT(DT), SE(SE) {}

/// Returns the number of partitions.		/// Returns the number of partitions.
unsigned getSize() const { return PartitionContainer.size(); }		unsigned getSize() const { return PartitionContainer.size(); }

/// Adds \p Inst into the current partition if that is marked to		/// Adds \p Inst into the current partition if that is marked to
/// contain cycles. Otherwise start a new partition for it.		/// contain cycles. Otherwise start a new partition for it.
void addToCyclicPartition(Instruction *Inst) {		void addToCyclicPartition(Instruction *Inst) {
// If the current partition is non-cyclic. Start a new one.		// If the current partition is non-cyclic. Start a new one.
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
return seenStore;		return seenStore;
});		});
}		}

/// Merges the partitions according to various heuristics.		/// Merges the partitions according to various heuristics.
void mergeBeforePopulating() {		void mergeBeforePopulating() {
if (DistributeMergeVectorizablePartitions)		if (DistributeMergeVectorizablePartitions)
mergeAdjacentNonCyclic();		mergeAdjacentNonCyclic();
		}

		/// If two adjacent partitions have accesses to the same buffer (with a
		/// small-ish stride between them), we should do both accesses during the same
		/// iteration to prevent pulling the same data into cache multiple times.
		void mergePartitionsWithNearbyAccesses() {
		mergeAdjacentPartitionsIf([&](InstPartition A, InstPartition B) {
		for (Instruction InstA : A) {
		for (Instruction InstB : B) {
		// If either partition is not vectorizable, we shouldn't merge or we
		// might lose opportunities for vectorization.
		if (A->hasDepCycle() \|\| B->hasDepCycle())
		return false;

		Value *PtrA = getLoadStorePointerOperand(InstA);
		Value *PtrB = getLoadStorePointerOperand(InstB);
		if (!PtrA \|\| !PtrB)
		return false;
		Optional<int> Distance = getPointersDiff(
		PtrA, PtrB, F->getParent()->getDataLayout(), *SE, false, false);

		if (Distance &&
		(unsigned)abs(*Distance) < DistributeNearbyLoadStoreThreshold) {
		LLVM_DEBUG(
		dbgs()
		<< "Merging partitions due to nearby load/stores in multiple "
		<< "partitions: " << A << ", " << B << "\n"
		<< "distance: " << Distance << "\n"
		<< "\t" << *InstA << "\n"
		<< "\t" << *InstB << "\n");
		return true;
		}
		}
		}
		return false;
		});
		}

		void mergeSmallPartitions() {
		mergeAdjacentPartitionsIf([&](InstPartition A, InstPartition B) {
		return PartitionLoadStoreCount(A) < DistributeMinLoadStorePerPartition \|\|
		PartitionLoadStoreCount(B) < DistributeMinLoadStorePerPartition;
		});
		}

		void mergeAfterPopulating() {
		mergeSmallPartitions();
		mergePartitionsWithNearbyAccesses();
if (!DistributeNonIfConvertible)		if (!DistributeNonIfConvertible)
mergeNonIfConvertible();		mergeNonIfConvertible();
}		}

/// Merges partitions in order to ensure that no loads are duplicated.		/// Merges partitions in order to ensure that no loads are duplicated.
///		///
/// We can't duplicate loads because that could potentially reorder them.		/// We can't duplicate loads because that could potentially reorder them.
/// LoopAccessAnalysis provides dependency information with the context that		/// LoopAccessAnalysis provides dependency information with the context that
/// the order of memory operation is preserved.		/// the order of memory operation is preserved.
///		///
/// Return if any partitions were merged.		/// Return if any partitions were merged.
bool mergeToAvoidDuplicatedLoads() {		bool mergeToAvoidDuplicatedLoads() {
using LoadToPartitionT = DenseMap<Instruction , InstPartition >;		using LoadToPartitionT = DenseMap<Instruction , InstPartition >;
using ToBeMergedT = EquivalenceClasses<InstPartition *>;		using ToBeMergedT = EquivalenceClasses<InstPartition *>;

LoadToPartitionT LoadToPartition;		LoadToPartitionT LoadToPartition;
ToBeMergedT ToBeMerged;		ToBeMergedT ToBeMerged;

// Step through the partitions and create equivalence between partitions		// Step through the partitions and create equivalence between partitions
// that contain the same load. Also put partitions in between them in the		// that contain the same load. Also put partitions in between them in the
// same equivalence class to avoid reordering of memory operations.		// same equivalence class to avoid reordering of memory operations.
for (PartitionContainerT::iterator I = PartitionContainer.begin(),		for (auto I = PartitionContainer.begin(), E = PartitionContainer.end();
E = PartitionContainer.end();
I != E; ++I) {		I != E; ++I) {
auto PartI = &I;		auto PartI = &I;

// If a load occurs in two partitions PartI and PartJ, merge all		// If a load occurs in two partitions PartI and PartJ, merge all
// partitions (PartI, PartJ] into PartI.		// partitions (PartI, PartJ] into PartI.
for (Instruction Inst : PartI)		for (Instruction Inst : PartI)
if (isa<LoadInst>(Inst)) {		if (isa<LoadInst>(Inst)) {
bool NewElt;		bool NewElt;
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	private:

/// List of partitions.		/// List of partitions.
PartitionContainerT PartitionContainer;		PartitionContainerT PartitionContainer;

/// Mapping from Instruction to partition Id. If the instruction		/// Mapping from Instruction to partition Id. If the instruction
/// belongs to multiple partitions the entry contains -1.		/// belongs to multiple partitions the entry contains -1.
InstToPartitionIdT InstToPartitionId;		InstToPartitionIdT InstToPartitionId;

		Function *F;
Loop *L;		Loop *L;
LoopInfo *LI;		LoopInfo *LI;
DominatorTree *DT;		DominatorTree *DT;
		ScalarEvolution *SE;

/// The control structure to merge adjacent partitions if both satisfy		/// The control structure to merge adjacent partitions if both satisfy
/// the \p Predicate.		/// the \p Predicate.
template <class UnaryPredicate>		void mergeAdjacentPartitionsIf(const std::function<bool(InstPartition *)> &Predicate) {
void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
InstPartition *PrevMatch = nullptr;		InstPartition *PrevMatch = nullptr;
for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {		for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
auto DoesMatch = Predicate(&*I);		auto DoesMatch = Predicate(&*I);
if (PrevMatch == nullptr && DoesMatch) {		if (PrevMatch == nullptr && DoesMatch) {
PrevMatch = &*I;		PrevMatch = &*I;
++I;		++I;
} else if (PrevMatch != nullptr && DoesMatch) {		} else if (PrevMatch != nullptr && DoesMatch) {
I->moveTo(*PrevMatch);		I->moveTo(*PrevMatch);
I = PartitionContainer.erase(I);		I = PartitionContainer.erase(I);
} else {		} else {
PrevMatch = nullptr;		PrevMatch = nullptr;
++I;		++I;
}		}
}		}
}		}

		void mergeAdjacentPartitionsIf(const std::function<bool(InstPartition, InstPartition)> &Predicate) {
		for (auto I = PartitionContainer.begin();
		I != PartitionContainer.end() &&
		std::next(I, 1) != PartitionContainer.end();) {
		auto J = std::next(I, 1);
		if (Predicate(&I, &J)) {
		J->moveTo(*I);
		PartitionContainer.erase(J);

		// Try merging with I again, so not changing I
		} else {
		++I;
		}
		}
		}

/// Assign new LoopIDs for the partition's cloned loop.		/// Assign new LoopIDs for the partition's cloned loop.
void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {		void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
Optional<MDNode *> PartitionID = makeFollowupLoopID(		Optional<MDNode *> PartitionID = makeFollowupLoopID(
OrigLoopID,		OrigLoopID,
{LLVMLoopDistributeFollowupAll,		{LLVMLoopDistributeFollowupAll,
Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential		Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
: LLVMLoopDistributeFollowupCoincident});		: LLVMLoopDistributeFollowupCoincident});
if (PartitionID.hasValue()) {		if (PartitionID.hasValue()) {
▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {

// If we can't vectorize and the set of dependencies is empty, then that means		// If we can't vectorize and the set of dependencies is empty, then that means
// that Loop Access Analysis gave up and the results are invalid. Don't try		// that Loop Access Analysis gave up and the results are invalid. Don't try
// to do loop distribution based off it, or Bad Things happen.		// to do loop distribution based off it, or Bad Things happen.
if (!Dependences || (!LAI->canVectorizeMemory() && Dependences->empty())) {		if (!Dependences || (!LAI->canVectorizeMemory() && Dependences->empty())) {
return fail("NoDeps", "dependency analysis failed");		return fail("NoDeps", "dependency analysis failed");
}		}

InstPartitionContainer Partitions(L, LI, DT);		InstPartitionContainer Partitions(F, L, LI, DT, SE);

// First, go through each memory operation and assign them to consecutive		// First, go through each memory operation and assign them to consecutive
// partitions (the order of partitions follows program order). Put those		// partitions (the order of partitions follows program order). Put those
// with unsafe dependences into "cyclic" partition otherwise put each store		// with unsafe dependences into "cyclic" partition otherwise put each store
// in its own "non-cyclic" partition (we'll merge these later).		// in its own "non-cyclic" partition (we'll merge these later).
//		//
// Note that a memory operation (e.g. Load2 below) at a program point that		// Note that a memory operation (e.g. Load2 below) at a program point that
// has an unsafe dependence (Store3->Load1) spanning over it must be		// has an unsafe dependence (Store3->Load1) spanning over it must be
Show All 37 Lines	bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
for (auto *Inst : DefsUsedOutside)		for (auto *Inst : DefsUsedOutside)
Partitions.addToNewNonCyclicPartition(Inst);		Partitions.addToNewNonCyclicPartition(Inst);

LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);		LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)		if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",		return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");		"cannot isolate unsafe dependencies");

// Run the merge heuristics: Merge non-cyclic adjacent partitions since we		// Run some merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.		// should be able to vectorize these together.
Partitions.mergeBeforePopulating();		Partitions.mergeBeforePopulating();

LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);		LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)		if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",		return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");		"cannot isolate unsafe dependencies");

// Now, populate the partitions with non-memory operations.		// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();		Partitions.populateUsedSet();
LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);		LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

// In order to preserve original lexical order for loads, keep them in the		// In order to preserve original lexical order for loads, keep them in the
// partition that we set up in the MemoryInstructionDependences loop.		// partition that we set up in the MemoryInstructionDependences loop.
if (Partitions.mergeToAvoidDuplicatedLoads()) {		if (Partitions.mergeToAvoidDuplicatedLoads()) {
LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"		LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
<< Partitions);		<< Partitions);
if (Partitions.getSize() < 2)		if (Partitions.getSize() < 2)
return fail("CantIsolateUnsafeDeps",		return fail("CantIsolateUnsafeDeps",
"cannot isolate unsafe dependencies");		"cannot isolate unsafe dependencies");
}		}

		Partitions.mergeAfterPopulating();
		if (Partitions.getSize() < 2)
		return fail("Unprofitable", "no profitable loop distribution found");

// Don't distribute the loop if we need too many SCEV run-time checks, or if		// Don't distribute the loop if we need too many SCEV run-time checks, or if
// we need any at all when inserting them would be illegal.		// we need any at all when inserting them would be illegal.
const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();		const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {		if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
return fail("RuntimeCheckWithConvergent",		return fail("RuntimeCheckWithConvergent",
"may not insert runtime check with convergent operation");		"may not insert runtime check with convergent operation");
}		}

▲ Show 20 Lines • Show All 319 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-merge-vectorizable-partitions=false -verify-loop-info -verify-dom-info -S < %s \			; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-min-load-store-per-partition=0 -verify-loop-info -verify-dom-info -S < %s \
	; RUN: | FileCheck %s			; RUN: | FileCheck %s

	; for (i = 0; i < n; i ++) {			; for (i = 0; i < n; i ++) {
	; sumA += A[i]			; sumA += A[i]
	; =========================			; =========================
	; sumB += B[i]			; sumB += B[i]
	; }			; }

	▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines