diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -106,6 +106,20 @@ cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); +static cl::opt<unsigned> DistributeRuntimePointerChecksThreshold( + "loop-distribute-runtime-check-threshold", cl::init(32), cl::Hidden, + cl::desc("The maximum number of runtime pointer aliasing checks for Loop " + "Distribution")); + +static cl::opt<bool> DistributeVectorizableLoops( + "loop-distribute-vectorizable-loops", cl::init(true), cl::Hidden, + cl::desc( + "Consider loops that are already vectorizable for loop distribution.")); + +static cl::opt<bool> DistributeMergeVectorizablePartitions( + "loop-distribute-merge-vectorizable-partitions", cl::init(true), cl::Hidden, + cl::desc("Merge adjacent partitions that are already vectorizable.")); + static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), cl::Hidden, @@ -130,6 +144,7 @@ public: InstPartition(Instruction *I, Loop *L, bool DepCycle = false) : DepCycle(DepCycle), OrigLoop(L) { + SeedInstructions.insert(I); Set.insert(I); } @@ -137,7 +152,10 @@ bool hasDepCycle() const { return DepCycle; } /// Adds an instruction to this partition. - void add(Instruction *I) { Set.insert(I); } + void addSeed(Instruction *I) { + SeedInstructions.insert(I); + Set.insert(I); + } /// Collection accessors. InstructionSet::iterator begin() { return Set.begin(); } @@ -149,6 +167,9 @@ /// Moves this partition into \p Other. This partition becomes empty /// after this. void moveTo(InstPartition &Other) { + Other.SeedInstructions.insert(SeedInstructions.begin(), + SeedInstructions.end()); + SeedInstructions.clear(); Other.Set.insert(Set.begin(), Set.end()); Set.clear(); Other.DepCycle |= DepCycle; @@ -207,6 +228,14 @@ /// Remaps the cloned instructions using VMap. 
void remapInstructions() { + // Seed instructions might be used outside the loop. + for (Instruction *I : SeedInstructions) { + I->replaceUsesWithIf(VMap[I], [&](Use &U) { + Instruction *Use = cast<Instruction>(U.getUser()); + return !OrigLoop->contains(Use->getParent()); + }); + } + remapInstructionsInBlocks(ClonedLoopBlocks, VMap); } @@ -253,6 +282,9 @@ /// Instructions from OrigLoop selected for this partition. InstructionSet Set; + /// Instructions from OrigLoop used to seed this partition. + InstructionSet SeedInstructions; + /// Whether this partition contains a dependence cycle. bool DepCycle; @@ -292,7 +324,7 @@ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); else - PartitionContainer.back().add(Inst); + PartitionContainer.back().addSeed(Inst); } /// Adds \p Inst into a partition that is not marked to contain @@ -336,7 +368,8 @@ /// Merges the partitions according to various heuristics. void mergeBeforePopulating() { - mergeAdjacentNonCyclic(); + if (DistributeMergeVectorizablePartitions) + mergeAdjacentNonCyclic(); if (!DistributeNonIfConvertible) mergeNonIfConvertible(); } @@ -684,13 +717,20 @@ // Currently, we only distribute to isolate the part of the loop with // dependence cycles to enable partial vectorization. - if (LAI->canVectorizeMemory()) + if (LAI->canVectorizeMemory() && !DistributeVectorizableLoops) return fail("MemOpsCanBeVectorized", "memory operations are safe for vectorization"); auto *Dependences = LAI->getDepChecker().getDependences(); - if (!Dependences || Dependences->empty()) - return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); + if (Dependences) + LLVM_DEBUG(dbgs() << "NumDependences: " << Dependences->size() << "\n"); + + // If we can't vectorize and the set of dependencies is empty, then that means + // that Loop Access Analysis gave up and the results are invalid. Don't try + // to do loop distribution based off it, or Bad Things happen. 
+ if (!Dependences || (!LAI->canVectorizeMemory() && Dependences->empty())) { + return fail("NoDeps", "dependency analysis failed"); + } InstPartitionContainer Partitions(L, LI, DT); @@ -749,6 +789,7 @@ // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); + LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); if (Partitions.getSize() < 2) return fail("CantIsolateUnsafeDeps", @@ -776,6 +817,8 @@ "may not insert runtime check with convergent operation"); } + LLVM_DEBUG(dbgs() << "LD: SCEV predicate complexity: " + << Pred.getComplexity() << "\n"); if (Pred.getComplexity() > (IsForced.getValueOr(false) ? PragmaDistributeSCEVCheckThreshold : DistributeSCEVCheckThreshold)) @@ -794,9 +837,16 @@ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, RtPtrChecking); + // Runtime pointer checks could be quadratic in the number of pointers. 
+ if (Checks.size() > DistributeRuntimePointerChecksThreshold) { + return fail("TooManyRuntimePointerChecks", + "too many runtime pointer-alias checks needed.\n"); + } + if (LAI->hasConvergentOp() && !Checks.empty()) { return fail("RuntimeCheckWithConvergent", "may not insert runtime check with convergent operation"); diff --git a/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-merge-vectorizable-partitions=false -verify-loop-info -verify-dom-info -S < %s \ +; RUN: | FileCheck %s + +; for (i = 0; i < n; i ++) { +; sumA += A[i] +; ========================= +; sumB += B[i] +; } + + +define i64 @f(i64 %n, i32* %a) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[ENTRY_SPLIT_LDIST1:%.*]] +; CHECK: entry.split.ldist1: +; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] +; CHECK: for.body.ldist1: +; CHECK-NEXT: [[INDEX_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[INDEX_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ] +; CHECK-NEXT: [[SUMA_LDIST1:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[SUMA_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ] +; CHECK-NEXT: [[IDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX_LDIST1]] +; CHECK-NEXT: [[LOADA_LDIST1:%.*]] = load i32, i32* [[IDXA_LDIST1]], align 4 +; CHECK-NEXT: [[SUMA_NEXT_LDIST1]] = add nuw nsw i32 [[LOADA_LDIST1]], [[SUMA_LDIST1]] +; CHECK-NEXT: [[INDEX_NEXT_LDIST1]] = add nuw nsw i64 [[INDEX_LDIST1]], 1 +; CHECK-NEXT: [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[INDEX_NEXT_LDIST1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]] +; CHECK: entry.split: +; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMIDXSQ:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[SUMIDXSQ_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDXSQ:%.*]] = mul i64 [[INDEX]], [[INDEX]] +; CHECK-NEXT: [[SUMIDXSQ_NEXT]] = add nuw nsw i64 [[IDXSQ]], [[SUMIDXSQ]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SUMA_NEXT_LDIST1]] to i64 +; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i64 [[ZEXT]], [[SUMIDXSQ_NEXT]] +; CHECK-NEXT: ret i64 [[RET]] +; +entry: + br label %for.body + +for.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %for.body ] + %sumA = phi i32 [ 0, %entry ], [ %sumA.next, %for.body ] + %sumIdxSq = phi i64 [ 0, %entry ], [ %sumIdxSq.next, %for.body ] + + %idxA = getelementptr inbounds i32, i32* %a, i64 %index + %loadA = load i32, i32* %idxA, align 4 + %sumA.next = add nuw nsw i32 %loadA, %sumA + + %idxSq = mul i64 %index, %index + %sumIdxSq.next = add nuw nsw i64 %idxSq, %sumIdxSq + + %index.next = add nuw nsw i64 %index, 1 + + %exitcond = icmp eq i64 %index.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %zext = zext i32 %sumA.next to i64 + %ret = add nuw nsw i64 %zext, %sumIdxSq.next + ret i64 %ret +} diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll --- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll +++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll @@ -25,9 +25,7 @@ target triple = "x86_64-apple-macosx10.11.0" ; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info (hotness: 300) -; HOTNESS: remark: /tmp/t.c:3:3: loop 
not distributed: memory operations are safe for vectorization (hotness: 300) ; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info{{$}} -; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization{{$}} define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 !prof !22 { entry: diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics.ll b/llvm/test/Transforms/LoopDistribute/diagnostics.ll --- a/llvm/test/Transforms/LoopDistribute/diagnostics.ll +++ b/llvm/test/Transforms/LoopDistribute/diagnostics.ll @@ -36,7 +36,6 @@ target triple = "x86_64-apple-macosx10.11.0" ; MISSED_REMARKS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info -; ALWAYS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization ; ALWAYS: warning: /tmp/t.c:3:3: loop not distributed: failed explicitly specified loop distribution define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 { @@ -67,7 +66,6 @@ ; NO_REMARKS-NOT: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization ; MISSED_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info -; ANALYSIS_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization ; ALWAYS-NOT: warning: /tmp/t.c:9:3: loop not distributed: failed explicitly specified loop distribution define void @not_forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !22 {