Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -164,8 +164,8 @@ MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L) : PSE(PSE), InnermostLoop(L), AccessIdx(0), MaxSafeRegisterWidth(-1U), - ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true), - RecordDependences(true) {} + ShouldRetryWithRuntimeCheck(false), RuntimeChecksFeasible(true), + SafeForVectorization(true), RecordDependences(true) {} /// Register the location (instructions are given increasing numbers) /// of a write access. @@ -185,7 +185,8 @@ ++AccessIdx; } - /// Check whether the dependencies between the accesses are safe. + /// Check whether the dependencies between the accesses are safe and whether + /// runtime checks are feasible. /// /// Only checks sets with elements in \p CheckDeps. bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, @@ -203,9 +204,13 @@ /// simultaneously, multiplied by the size of the element in bits. uint64_t getMaxSafeRegisterWidth() const { return MaxSafeRegisterWidth; } - /// In same cases when the dependency check fails we can still - /// vectorize the loop with a dynamic array access check. - bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } + /// In some cases when the dependency check fails we can still vectorize + /// the loop with a dynamic array access check. If we already found + /// a known (non-Unknown) unsafe dependence, we do not retry with runtime + /// checks, because they would always fail. + bool shouldRetryWithRuntimeCheck() { + return RuntimeChecksFeasible && ShouldRetryWithRuntimeCheck; + } /// Returns the memory dependences. If null is returned we exceeded /// the MaxDependences threshold and this information is not @@ -269,6 +274,10 @@ /// vectorize this loop with runtime checks. 
bool ShouldRetryWithRuntimeCheck; + /// If we see a non-unknown unsafe dependence, there is no point in generating + /// runtime checks. + bool RuntimeChecksFeasible; + /// No memory dependence was encountered that would inhibit /// vectorization. bool SafeForVectorization; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -1616,7 +1616,8 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const ValueToValueMap &Strides) { - + // Runtime checks are only feasible if only unknown dependences prevent + // vectorization. MaxSafeDepDistBytes = -1; SmallPtrSet<MemAccessInfo, 8> Visited; for (MemAccessInfo CurAccess : CheckDeps) { @@ -1652,7 +1653,12 @@ Dependence::DepType Type = isDependent(*A.first, A.second, *B.first, B.second, Strides); - SafeForVectorization &= Dependence::isSafeForVectorization(Type); + bool DepSafe = Dependence::isSafeForVectorization(Type); + SafeForVectorization &= DepSafe; + // Runtime checks are only feasible if all unsafe dependencies are + // unknown. For other unsafe deps, we already know they will fail + // the runtime checks at compile time. + RuntimeChecksFeasible &= Type == Dependence::Unknown || DepSafe; // Gather dependences unless we accumulated MaxDependences // dependences. In that case return as soon as we find the first @@ -1669,8 +1675,13 @@ << "Too many dependences, stopped recording\n"); } } - if (!RecordDependences && !SafeForVectorization) + // We do not generate runtime checks for accesses with constant + // strides, so without investigating all dependences, we cannot + // be sure runtime checks are safe. 
+ if (!RecordDependences && !SafeForVectorization) { + RuntimeChecksFeasible = false; return false; + } } ++OI; } Index: test/Transforms/LoopVectorize/runtime-check.ll =================================================================== --- test/Transforms/LoopVectorize/runtime-check.ll +++ test/Transforms/LoopVectorize/runtime-check.ll @@ -117,6 +117,48 @@ ret void } +; Check we do not generate runtime checks if we found a known dependence preventing +; vectorization. In this case, it is a read of c[i-1] followed by a write of c[i]. +; The runtime checks would always fail. + +; void test_runtime_check2(float *a, float b, unsigned offset, unsigned offset2, unsigned n, float *c) { +; for (unsigned i = 1; i < n; i++) { +; a[i+offset] += a[i+offset2] * b; +; c[i] = c[i-1] + 1.0f; +; } +; } +; +; CHECK-LABEL: test_runtime_check2 +; CHECK-NOT: <4 x float> +define void @test_runtime_check2(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n, float* %c) { +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ind.sum = add i64 %iv, %offset + %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum + %l1 = load float, float* %arr.idx, align 4 + %ind.sum2 = add i64 %iv, %offset2 + %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2 + %l2 = load float, float* %arr.idx2, align 4 + %m = fmul fast float %b, %l2 + %ad = fadd fast float %l1, %m + store float %ad, float* %arr.idx, align 4 + %c.ind = add i64 %iv, -1 + %c.idx = getelementptr inbounds float, float* %c, i64 %c.ind + %lc = load float, float* %c.idx, align 4 + %vc = fadd float %lc, 1.0 + %c.idx2 = getelementptr inbounds float, float* %c, i64 %iv + store float %vc, float* %c.idx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %loopexit, label %for.body + +loopexit: + ret void +} + ; CHECK: !9 = !DILocation(line: 101, column: 1, scope: !{{.*}}) !llvm.module.flags = !{!0, !1}