diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -106,6 +106,20 @@
     cl::desc("The maximum number of SCEV checks allowed for Loop "
              "Distribution"));
 
+static cl::opt<unsigned> DistributeRuntimePointerChecksThreshold(
+    "loop-distribute-runtime-check-threshold", cl::init(32), cl::Hidden,
+    cl::desc("The maximum number of runtime pointer aliasing checks for Loop "
+             "Distribution"));
+
+static cl::opt<bool> DistributeVectorizableLoops(
+    "loop-distribute-vectorizable-loops", cl::init(true), cl::Hidden,
+    cl::desc(
+        "Consider vectorizable loops for loop distribution."));
+
+static cl::opt<bool> DistributeMergeVectorizablePartitions(
+    "loop-distribute-merge-vectorizable-partitions", cl::init(true), cl::Hidden,
+    cl::desc("Merge adjacent partitions that are already vectorizable."));
+
 static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
     "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
     cl::Hidden,
@@ -130,6 +144,7 @@
 public:
   InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
       : DepCycle(DepCycle), OrigLoop(L) {
+    SeedInstructions.insert(I);
     Set.insert(I);
   }
 
@@ -137,7 +152,10 @@
   bool hasDepCycle() const { return DepCycle; }
 
   /// Adds an instruction to this partition.
-  void add(Instruction *I) { Set.insert(I); }
+  void addSeed(Instruction *I) {
+    SeedInstructions.insert(I);
+    Set.insert(I);
+  }
 
   /// Collection accessors.
   InstructionSet::iterator begin() { return Set.begin(); }
@@ -149,6 +167,9 @@
   /// Moves this partition into \p Other. This partition becomes empty
   /// after this.
   void moveTo(InstPartition &Other) {
+    Other.SeedInstructions.insert(SeedInstructions.begin(),
+                                  SeedInstructions.end());
+    SeedInstructions.clear();
     Other.Set.insert(Set.begin(), Set.end());
     Set.clear();
     Other.DepCycle |= DepCycle;
@@ -207,6 +228,14 @@
   /// Remaps the cloned instructions using VMap.
   void remapInstructions() {
+    // Seed instructions might be used outside the loop.
+    for (Instruction *I : SeedInstructions) {
+      I->replaceUsesWithIf(VMap[I], [&](Use &U) {
+        Instruction *Use = cast<Instruction>(U.getUser());
+        return !OrigLoop->contains(Use->getParent());
+      });
+    }
+
     remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
   }
 
@@ -253,6 +282,9 @@
   /// Instructions from OrigLoop selected for this partition.
   InstructionSet Set;
 
+  /// Instructions from OrigLoop used to seed this partition.
+  InstructionSet SeedInstructions;
+
   /// Whether this partition contains a dependence cycle.
   bool DepCycle;
 
@@ -292,7 +324,7 @@
     if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
       PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
     else
-      PartitionContainer.back().add(Inst);
+      PartitionContainer.back().addSeed(Inst);
   }
 
   /// Adds \p Inst into a partition that is not marked to contain
@@ -336,7 +368,8 @@
   /// Merges the partitions according to various heuristics.
   void mergeBeforePopulating() {
-    mergeAdjacentNonCyclic();
+    if (DistributeMergeVectorizablePartitions)
+      mergeAdjacentNonCyclic();
     if (!DistributeNonIfConvertible)
       mergeNonIfConvertible();
   }
@@ -652,6 +685,15 @@
   AccessesType Accesses;
 };
 
+static bool hasPossiblyBackwardDependences(
+    const SmallVectorImpl<MemoryDepChecker::Dependence> &Dependences) {
+  for (auto &Dep : Dependences)
+    if (Dep.isPossiblyBackward())
+      return true;
+
+  return false;
+}
+
 /// The actual class performing the per-loop work.
 class LoopDistributeForLoop {
 public:
@@ -684,13 +726,30 @@
     // Currently, we only distribute to isolate the part of the loop with
     // dependence cycles to enable partial vectorization.
-    if (LAI->canVectorizeMemory())
+    if (LAI->canVectorizeMemory() && !DistributeVectorizableLoops)
       return fail("MemOpsCanBeVectorized",
                   "memory operations are safe for vectorization");
 
     auto *Dependences = LAI->getDepChecker().getDependences();
-    if (!Dependences || Dependences->empty())
-      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
+    if (!Dependences)
+      return fail("NoDeps", "dependency analysis failed");
+
+    LLVM_DEBUG(dbgs() << "NumDependences: " << Dependences->size() << "\n");
+
+    // If there are potentially-backward dependences (which don't prevent
+    // vectorization), loop distribution would be spurious.
+    if (LAI->canVectorizeMemory() &&
+        hasPossiblyBackwardDependences(*Dependences)) {
+      return fail("VectorizableDependences",
+                  "dependences that won't block vectorization found");
+    }
+
+    // If we can't vectorize and the set of dependences is empty, Loop Access
+    // Analysis gave up and its results are invalid. Don't attempt loop
+    // distribution based on them.
+    if (!LAI->canVectorizeMemory() && Dependences->empty()) {
+      return fail("NoDeps", "dependency analysis failed");
+    }
 
     InstPartitionContainer Partitions(L, LI, DT);
 
@@ -749,6 +808,7 @@
     // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
     // should be able to vectorize these together.
     Partitions.mergeBeforePopulating();
+    LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
 
     if (Partitions.getSize() < 2)
       return fail("CantIsolateUnsafeDeps",
@@ -776,6 +836,8 @@
                   "may not insert runtime check with convergent operation");
     }
 
+    LLVM_DEBUG(dbgs() << "LD: SCEV predicate complexity: "
+                      << Pred.getComplexity() << "\n");
     if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                     ? PragmaDistributeSCEVCheckThreshold
                                     : DistributeSCEVCheckThreshold))
@@ -794,9 +856,16 @@
     auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
     const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
     const auto &AllChecks = RtPtrChecking->getChecks();
+
     auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                   RtPtrChecking);
+    // Runtime pointer checks could be quadratic in the number of pointers.
+    if (Checks.size() > DistributeRuntimePointerChecksThreshold) {
+      return fail("TooManyRuntimePointerChecks",
+                  "too many runtime pointer-alias checks needed.\n");
+    }
+
     if (LAI->hasConvergentOp() && !Checks.empty()) {
       return fail("RuntimeCheckWithConvergent",
                   "may not insert runtime check with convergent operation");
     }
diff --git a/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll
new file mode
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/bug-uses-outside-loop.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-distribute -enable-loop-distribute --loop-distribute-merge-vectorizable-partitions=false -verify-loop-info -verify-dom-info -S < %s \
+; RUN:   | FileCheck %s
+
+; for (i = 0; i < n; i++) {
+;   sumA += A[i]
+;   =========================
+;   sumIdxSq += i * i
+; }
+
+
+define i64 @f(i64 %n, i32* %a) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_LDIST1:%.*]]
+; CHECK:       entry.split.ldist1:
+; CHECK-NEXT:    br label [[FOR_BODY_LDIST1:%.*]]
+; CHECK:       for.body.ldist1:
+; CHECK-NEXT:    [[INDEX_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[INDEX_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[SUMA_LDIST1:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[SUMA_NEXT_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[IDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX_LDIST1]]
+; CHECK-NEXT:    [[LOADA_LDIST1:%.*]] = load i32, i32* [[IDXA_LDIST1]], align 4
+; CHECK-NEXT:    [[SUMA_NEXT_LDIST1]] = add nuw nsw i32 [[LOADA_LDIST1]], [[SUMA_LDIST1]]
+; CHECK-NEXT:    [[INDEX_NEXT_LDIST1]] = add nuw nsw i64 [[INDEX_LDIST1]], 1
+; CHECK-NEXT:    [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[INDEX_NEXT_LDIST1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUMIDXSQ:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[SUMIDXSQ_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXSQ:%.*]] = mul i64 [[INDEX]], [[INDEX]]
+; CHECK-NEXT:    [[SUMIDXSQ_NEXT]] = add nuw nsw i64 [[IDXSQ]], [[SUMIDXSQ]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SUMA_NEXT_LDIST1]] to i64
+; CHECK-NEXT:    [[RET:%.*]] = add nuw nsw i64 [[ZEXT]], [[SUMIDXSQ_NEXT]]
+; CHECK-NEXT:    ret i64 [[RET]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %for.body ]
+  %sumA = phi i32 [ 0, %entry ], [ %sumA.next, %for.body ]
+  %sumIdxSq = phi i64 [ 0, %entry ], [ %sumIdxSq.next, %for.body ]
+
+  %idxA = getelementptr inbounds i32, i32* %a, i64 %index
+  %loadA = load i32, i32* %idxA, align 4
+  %sumA.next = add nuw nsw i32 %loadA, %sumA
+
+  %idxSq = mul i64 %index, %index
+  %sumIdxSq.next = add nuw nsw i64 %idxSq, %sumIdxSq
+
+  %index.next = add nuw nsw i64 %index, 1
+
+  %exitcond = icmp eq i64 %index.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  %zext = zext i32 %sumA.next to i64
+  %ret = add nuw nsw i64 %zext, %sumIdxSq.next
+  ret i64 %ret
+}
diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
--- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
+++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
@@ -25,9 +25,7 @@
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info (hotness: 300)
-; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization (hotness: 300)
 ; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info{{$}}
-; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization{{$}}
 
 define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 !prof !22 {
 entry:
diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics.ll b/llvm/test/Transforms/LoopDistribute/diagnostics.ll
--- a/llvm/test/Transforms/LoopDistribute/diagnostics.ll
+++ b/llvm/test/Transforms/LoopDistribute/diagnostics.ll
@@ -36,7 +36,6 @@
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; MISSED_REMARKS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info
-; ALWAYS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization
 ; ALWAYS: warning: /tmp/t.c:3:3: loop not distributed: failed explicitly specified loop distribution
 
 define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 {
@@ -67,7 +66,6 @@
 ; NO_REMARKS-NOT: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization
 ; MISSED_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info
-; ANALYSIS_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization
 ; ALWAYS-NOT: warning: /tmp/t.c:9:3: loop not distributed: failed explicitly specified loop distribution
 
 define void @not_forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !22 {
diff --git a/llvm/test/Transforms/LoopDistribute/vectorizable-dependences.ll
new file mode
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/vectorizable-dependences.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basic-aa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
+; RUN:   < %s | FileCheck %s
+
+@A = global [2 x [16 x [16 x i32]]] zeroinitializer
+@B = global [2 x [16 x [16 x i32]]] zeroinitializer
+@C = global [16 x [16 x i32]] zeroinitializer
+@D = global [16 x [16 x i32]] zeroinitializer
+
+define void @backward_vectorizable(i32 %j) {
+; CHECK-LABEL: @backward_vectorizable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[J:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 1, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 1, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @D, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @C, i64 0, i64 [[INDVARS_IV]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARRAYIDX22]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %idxprom1 = sext i32 %j to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx2 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 0, i64 %indvars.iv, i64 %idxprom1
+  %0 = load i32, i32* %arrayidx2, align 4
+  %arrayidx6 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 0, i64 %indvars.iv, i64 %idxprom1
+  store i32 %0, i32* %arrayidx6, align 4
+  %arrayidx10 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @B, i64 0, i64 1, i64 %indvars.iv, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx10, align 4
+  %arrayidx14 = getelementptr inbounds [2 x [16 x [16 x i32]]], [2 x [16 x [16 x i32]]]* @A, i64 0, i64 1, i64 %indvars.iv, i64 %idxprom1
+  store i32 %1, i32* %arrayidx14, align 4
+  %arrayidx18 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @D, i64 0, i64 %indvars.iv, i64 %idxprom1
+  %2 = load i32, i32* %arrayidx18, align 4
+  %arrayidx22 = getelementptr inbounds [16 x [16 x i32]], [16 x [16 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %idxprom1
+  store i32 %2, i32* %arrayidx22, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+%struct = type { [361 x i32], [361 x i32] }
+define void @backward_vectorizable2(i32 %a, i32 %b, %struct* nocapture %S) {
+; CHECK-LABEL: @backward_vectorizable2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1, [[A:%.*]]
+; CHECK-NEXT:    [[CMP_NOT12:%.*]] = icmp sgt i32 [[A]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_NOT12]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B]], 1
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], [[TMP2]]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[S:%.*]], i64 0, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    store i32 2, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT]], %struct* [[S]], i64 0, i32 1, i64 [[IDXPROM]]
+; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[TMP1]], [[LFTR_WIDEIV]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %sub = sub i32 1, %a
+  %cmp.not12 = icmp sgt i32 %a, %b
+  br i1 %cmp.not12, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %a to i64
+  %1 = add i32 %b, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %2 = trunc i64 %indvars.iv to i32
+  %add = add i32 %sub, %2
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds %struct, %struct* %S, i64 0, i32 0, i64 %idxprom
+  store i32 2, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds %struct, %struct* %S, i64 0, i32 1, i64 %idxprom
+  store i32 1, i32* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond.not = icmp eq i32 %1, %lftr.wideiv
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}