Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -652,13 +652,15 @@ Value *Ptr, Value *OrigPtr = nullptr); /// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space, assuming \p Preds is true. +/// the address space, assuming \p Preds is true. Returns, via argument \p Lp, +/// the loop over which the pointer is strided. \p Lp might be nullptr when the +/// function returns zero. /// /// If necessary this method will version the stride of the pointer according /// to \p PtrToStride and therefore add a new predicate to \p Preds. /// The \p Assume parameter indicates if we are allowed to make additional /// run-time assumptions. -int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, +int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, Loop *&Lp, const ValueToValueMap &StridesMap, bool Assume = false); /// \brief Returns true if the memory operations \p A and \p B are consecutive. Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -566,11 +566,12 @@ else ++NumReadPtrChecks; + Loop *Lp = nullptr; if (hasComputableBounds(PSE, StridesMap, Ptr, TheLoop) && // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. (!ShouldCheckStride || - isStridedPtr(PSE, Ptr, TheLoop, StridesMap) == 1)) { + (isStridedPtr(PSE, Ptr, Lp, StridesMap) == 1 && Lp != nullptr && Lp->contains(TheLoop)))) { // The id of the dependence set. unsigned DepId; @@ -820,8 +821,10 @@ /// \brief Check whether the access through \p Ptr has a constant stride. 
int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, - const Loop *Lp, const ValueToValueMap &StridesMap, + Loop *&Lp, const ValueToValueMap &StridesMap, bool Assume) { + assert(Lp == nullptr && "Passed non-null pointer"); + Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -845,12 +848,9 @@ return 0; } - // The accesss function must stride over the innermost loop. - if (Lp != AR->getLoop()) { - DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << - *Ptr << " SCEV: " << *AR << "\n"); - return 0; - } + // Return via func. argument the loop over which access function is strided, + // so the appropriate checks could be performed by the caller of isStridedPtr. + Lp = const_cast<Loop *>(AR->getLoop()); // The address calculation must not wrap. Otherwise, a dependence could be // inverted. @@ -1148,8 +1148,11 @@ const SCEV *AScev = replaceSymbolicStrideSCEV(PSE, Strides, APtr); const SCEV *BScev = replaceSymbolicStrideSCEV(PSE, Strides, BPtr); - int StrideAPtr = isStridedPtr(PSE, APtr, InnermostLoop, Strides, true); - int StrideBPtr = isStridedPtr(PSE, BPtr, InnermostLoop, Strides, true); + Loop *APtrLp = nullptr; + Loop *BPtrLp = nullptr; + + int StrideAPtr = isStridedPtr(PSE, APtr, APtrLp, Strides, true); + int StrideBPtr = isStridedPtr(PSE, BPtr, BPtrLp, Strides, true); const SCEV *Src = AScev; const SCEV *Sink = BScev; @@ -1176,7 +1179,7 @@ // Need accesses with constant stride. We don't want to vectorize // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in // the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ + if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr || APtrLp != InnermostLoop || BPtrLp != InnermostLoop){ DEBUG(dbgs() << "Pointer access with non-constant stride\n"); return Dependence::Unknown; } @@ -1598,7 +1601,8 @@ // read a few words, modify, and write a few words, and some of the // words may be written to the same address. 
bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, TheLoop, Strides)) { + Loop *Lp = nullptr; + if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, Lp, Strides) || Lp != TheLoop) { ++NumReads; IsReadOnlyPtr = true; } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4586,8 +4586,13 @@ StoreInst *SI = dyn_cast<StoreInst>(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); - + Loop *Lp = nullptr; + int Stride = isStridedPtr(PSE, Ptr, Lp, Strides); + + // Consider stride to be zero in case of Ptr not strided over TheLoop + if (Lp != TheLoop) + Stride = 0; + // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); Index: test/Analysis/LoopAccessAnalysis/stride-vectorization.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/stride-vectorization.ll +++ test/Analysis/LoopAccessAnalysis/stride-vectorization.ll @@ -0,0 +1,41 @@ +; RUN: opt -loop-vectorize -S < %s | FileCheck %s +; CHECK-LABEL: Test +; CHECK: %vector.body + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } + +define void @Test(%struct.s* nocapture %obj) #0 { + br label %.outer.preheader + + +.outer.preheader: + %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ] + %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i + br label %.inner + +.exit: + ret void + +.outer: + %i.next = add nuw nsw i64 %i, 1 + %exitcond.outer = icmp eq i64 %i.next, 32 + br i1 %exitcond.outer, label %.exit, label %.outer.preheader + +.inner: + %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ] + %2 = getelementptr inbounds %struct.s, 
%struct.s* %obj, i64 0, i32 0, i64 %j + %3 = load i32, i32* %2 + %4 = load i32, i32* %1 + %5 = add nsw i32 %4, %3 + %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j + %7 = load i32, i32* %6 + %8 = add nsw i32 %5, %7 + store i32 %8, i32* %6 + %j.next = add nuw nsw i64 %j, 1 + %exitcond.inner = icmp eq i64 %j.next, 32 + br i1 %exitcond.inner, label %.outer, label %.inner +} +