diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1835,6 +1835,9 @@
       BPtr->getType()->getPointerAddressSpace())
     return Dependence::Unknown;
 
+  if (isa<ScalableVectorType>(ATy) || isa<ScalableVectorType>(BTy))
+    return Dependence::Unknown;
+
   int64_t StrideAPtr =
       getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true);
   int64_t StrideBPtr =
@@ -1862,10 +1865,21 @@
   LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
                     << *InstMap[BIdx] << ": " << *Dist << "\n");
 
-  // Need accesses with constant stride. We don't want to vectorize
-  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
-  // the address space.
-  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
+  // Compute the stride to use. If one of the accesses is loop-invariant, use
+  // the stride of the AddRec. Those cases are only used to try to rule out
+  // dependences if we can prove the distance between the accesses is larger
+  // than the range the accesses will travel through the execution of the loop.
+  uint64_t Stride = -1;
+  if (StrideAPtr && StrideBPtr && StrideAPtr == StrideBPtr) {
+    Stride = std::abs(StrideAPtr);
+  } else if (StrideAPtr && SE.isLoopInvariant(Sink, InnermostLoop)) {
+    Stride = std::abs(StrideAPtr);
+  } else if (StrideBPtr && SE.isLoopInvariant(Src, InnermostLoop)) {
+    Stride = std::abs(StrideBPtr);
+  } else {
+    // Need at least one access with constant stride. We don't want to vectorize
+    // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
+    // in the address space.
     LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
     return Dependence::Unknown;
   }
@@ -1874,14 +1888,21 @@
   uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
   bool HasSameSize =
       DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
-  uint64_t Stride = std::abs(StrideAPtr);
+  if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
+      isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
+                               Stride, TypeByteSize))
+    return Dependence::NoDep;
+
+  // Need accesses with constant stride. We don't want to vectorize
+  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
+  // the address space.
+  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
+    LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
+    return Dependence::Unknown;
+  }
+
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
   if (!C) {
-    if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
-        isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
-                                 Stride, TypeByteSize))
-      return Dependence::NoDep;
-
     LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
     FoundNonConstantDistanceDependence = true;
     return Dependence::Unknown;
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/loop-invariant-dep-with-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/loop-invariant-dep-with-backedge-taken-count.ll
--- a/llvm/test/Analysis/LoopAccessAnalysis/loop-invariant-dep-with-backedge-taken-count.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/loop-invariant-dep-with-backedge-taken-count.ll
@@ -37,13 +37,8 @@
 define void @test_distance_much_greater_than_BTC_100(ptr %a) {
 ; CHECK-LABEL: Loop access info in function 'test_distance_much_greater_than_BTC_100':
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT:      Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
-; CHECK-NEXT:            %l = load i32, ptr %gep.x, align 4 ->
-; CHECK-NEXT:            store i32 %l, ptr %gep, align 4
-; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -10,27 +10,33 @@
 define void @test(ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 200
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 200
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 200, 200
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
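For reference, below is a minimal sketch of the access pattern the new logic rules out, loosely mirroring @test_distance_much_greater_than_BTC_100 from the test above. The function name, offset, and trip count are illustrative assumptions, not the exact test body: the strided store travels at most 100 iterations * 4 bytes = 400 bytes from %a, while the load's loop-invariant address sits 4000 bytes into %a, so the distance provably exceeds the range the accesses cover and isSafeDependenceDistance can discard the dependence.

; Hypothetical reduction of the pattern handled by this patch; offsets and
; names are illustrative only.
define void @loop_invariant_load_far_away(ptr %a) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Loop-invariant address: %a + 1000 * 4 = 4000 bytes.
  %gep.x = getelementptr i32, ptr %a, i64 1000
  %l = load i32, ptr %gep.x, align 4
  ; Stride-1 store: writes bytes [0, 400) of %a over the loop's execution.
  %gep = getelementptr i32, ptr %a, i64 %iv
  store i32 %l, ptr %gep, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 100
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

With this patch, LAA reports "Memory dependences are safe" for such loops instead of an unknown data dependence, as the first test update shows.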