Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -662,7 +662,7 @@ /// to \p PtrToStride and therefore add further predicates to \p PSE. /// The \p Assume parameter indicates if we are allowed to make additional /// run-time assumptions. -int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, +int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const ValueToValueMap &StridesMap = ValueToValueMap(), bool Assume = false); Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -582,8 +582,8 @@ if (hasComputableBounds(PSE, StridesMap, Ptr, TheLoop) && // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. - (!ShouldCheckStride || - isStridedPtr(PSE, Ptr, TheLoop, StridesMap) == 1)) { + (!ShouldCheckStride || + isStridedPtr(PSE, Ptr, StridesMap) == 1)) { // The id of the dependence set. unsigned DepId; @@ -833,8 +833,7 @@ /// \brief Check whether the access through \p Ptr has a constant stride. int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, - const Loop *Lp, const ValueToValueMap &StridesMap, - bool Assume) { + const ValueToValueMap &StridesMap, bool Assume) { Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -858,12 +857,8 @@ return 0; } - // The accesss function must stride over the innermost loop. - if (Lp != AR->getLoop()) { - DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << - *Ptr << " SCEV: " << *AR << "\n"); - return 0; - } + // The loop over which the access function strides. + const Loop* Lp = AR->getLoop(); // The address calculation must not wrap. Otherwise, a dependence could be // inverted. 
@@ -1161,8 +1156,14 @@ const SCEV *AScev = replaceSymbolicStrideSCEV(PSE, Strides, APtr); const SCEV *BScev = replaceSymbolicStrideSCEV(PSE, Strides, BPtr); - int StrideAPtr = isStridedPtr(PSE, APtr, InnermostLoop, Strides, true); - int StrideBPtr = isStridedPtr(PSE, BPtr, InnermostLoop, Strides, true); + // Get the pointer strides with respect to InnermostLoop; pointers that are + // invariant in InnermostLoop are treated as having zero stride. + int StrideAPtr = PSE.getSE()->isLoopInvariant(AScev, InnermostLoop) + ? 0 + : isStridedPtr(PSE, APtr, Strides, true); + int StrideBPtr = PSE.getSE()->isLoopInvariant(BScev, InnermostLoop) + ? 0 + : isStridedPtr(PSE, BPtr, Strides, true); const SCEV *Src = AScev; const SCEV *Sink = BScev; @@ -1615,7 +1616,7 @@ // read a few words, modify, and write a few words, and some of the // words may be written to the same address. bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, TheLoop, Strides)) { + if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, Strides)) { ++NumReads; IsReadOnlyPtr = true; } Index: lib/Transforms/Scalar/LoopLoadElimination.cpp =================================================================== --- lib/Transforms/Scalar/LoopLoadElimination.cpp +++ lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -62,8 +62,7 @@ /// \brief Return true if the dependence from the store to the load has a /// distance of one. E.g. A[i+1] = A[i] - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadPtrType = LoadPtr->getType(); @@ -77,8 +76,8 @@ // Currently we only support accesses with unit stride. FIXME: we should be // able to handle non unit stirde as well as long as the stride is equal to // the dependence distance. 
- if (isStridedPtr(PSE, LoadPtr, L) != 1 || - isStridedPtr(PSE, LoadPtr, L) != 1) + if (isStridedPtr(PSE, LoadPtr) != 1 || + isStridedPtr(PSE, StorePtr) != 1) return false; auto &DL = Load->getParent()->getModule()->getDataLayout(); @@ -238,8 +237,8 @@ // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE) && + OtherCand->isDependenceDistanceOfOne(PSE)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -452,7 +451,7 @@ // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE)) continue; ++NumForwarding; Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4915,8 +4915,13 @@ StoreInst *SI = dyn_cast<StoreInst>(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); - + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); + + // Pointers invariant in TheLoop have zero stride; otherwise query the stride. + int Stride = 0; + if (!PSE.getSE()->isLoopInvariant(PtrScev, TheLoop)) + Stride = isStridedPtr(PSE, Ptr, Strides); + // The factor of the corresponding interleave group. 
unsigned Factor = std::abs(Stride); @@ -4924,7 +4929,6 @@ if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4933,7 +4937,7 @@ if (!Align) Align = DL.getABITypeAlignment(PtrTy->getElementType()); - StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); + StrideAccesses[I] = StrideDescriptor(Stride, PtrScev, Size, Align); } } Index: test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll +++ test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll @@ -0,0 +1,46 @@ +; This is the test case from PR26314. The inner loop loads through %1, whose address depends only on the outer induction variable %i. +; RUN: opt -loop-accesses -analyze -S < %s | FileCheck %s +; CHECK: function 'Test': +; CHECK: .inner: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK: Check 0: +; CHECK: Check 1: + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } + +define void @Test(%struct.s* nocapture %obj) #0 { + br label %.outer.preheader + + +.outer.preheader: + %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ] + %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i + br label %.inner + +.exit: + ret void + +.outer: + %i.next = add nuw nsw i64 %i, 1 + %exitcond.outer = icmp eq i64 %i.next, 32 + br i1 %exitcond.outer, label %.exit, label %.outer.preheader + +.inner: + %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ] + %2 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 0, i64 %j + %3 = load i32, i32* %2 + %4 = load i32, i32* %1 + %5 = add nsw i32 %4, %3 + %6 = getelementptr inbounds %struct.s, 
%struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j + %7 = load i32, i32* %6 + %8 = add nsw i32 %5, %7 + store i32 %8, i32* %6 + %j.next = add nuw nsw i64 %j, 1 + %exitcond.inner = icmp eq i64 %j.next, 32 + br i1 %exitcond.inner, label %.outer, label %.inner +} Index: test/Transforms/LoopVectorize/multiple-strides-vectorization.ll =================================================================== --- test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -0,0 +1,43 @@ +; This is the test case from PR26314. The inner loop loads through %1, whose address depends only on the outer induction variable %i. +; RUN: opt -loop-vectorize -S < %s | FileCheck %s +; CHECK-LABEL: Test +; CHECK: <4 x i64> +; CHECK: <4 x i32>, <4 x i32> +; CHECK: llvm.loop.vectorize.width + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } + +define void @Test(%struct.s* nocapture %obj) #0 { + br label %.outer.preheader + + +.outer.preheader: + %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ] + %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i + br label %.inner + +.exit: + ret void + +.outer: + %i.next = add nuw nsw i64 %i, 1 + %exitcond.outer = icmp eq i64 %i.next, 32 + br i1 %exitcond.outer, label %.exit, label %.outer.preheader + +.inner: + %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ] + %2 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 0, i64 %j + %3 = load i32, i32* %2 + %4 = load i32, i32* %1 + %5 = add nsw i32 %4, %3 + %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j + %7 = load i32, i32* %6 + %8 = add nsw i32 %5, %7 + store i32 %8, i32* %6 + %j.next = add nuw nsw i64 %j, 1 + %exitcond.inner = icmp eq i64 %j.next, 32 + br i1 %exitcond.inner, label %.outer, label %.inner +}