Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -163,7 +163,7 @@ }; MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L) - : PSE(PSE), InnermostLoop(L), AccessIdx(0), + : PSE(PSE), InnermostLoop(L), AccessIdx(0), MaxSafeRegisterWidth(0), ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true), RecordDependences(true) {} @@ -199,6 +199,10 @@ /// the accesses safely with. uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + /// \brief Return the number of elements that are safe to operate on + /// simultaneously, multiplied by the size of the element in bits. + uint64_t getMaxSafeRegisterWidth() const { return MaxSafeRegisterWidth; } + /// \brief In same cases when the dependency check fails we can still /// vectorize the loop with a dynamic array access check. bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } @@ -255,6 +259,12 @@ // We can access this many bytes in parallel safely. uint64_t MaxSafeDepDistBytes; + /// \brief Number of elements (from consecutive iterations) that are safe to + /// operate on simultaneously, multiplied by the size of the element in bits. + /// The size of the element is taken from the memory access that causes the + /// worst restriction. + uint64_t MaxSafeRegisterWidth; + /// \brief If we see a non-constant dependence distance we can still try to /// vectorize this loop with runtime checks. 
bool ShouldRetryWithRuntimeCheck; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -1471,10 +1471,12 @@ couldPreventStoreLoadForward(Distance, TypeByteSize)) return Dependence::BackwardVectorizableButPreventsForwarding; + uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride); DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() - << " with max VF = " - << MaxSafeDepDistBytes / (TypeByteSize * Stride) << '\n'); - + << " with max VF = " << MaxVF << '\n'); + uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8 ; + MaxSafeRegisterWidth = (MaxSafeRegisterWidth == 0) ? + MaxVFInBits : std::min(MaxSafeRegisterWidth, MaxVFInBits); return Dependence::BackwardVectorizable; } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1673,6 +1673,10 @@ unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } + uint64_t getMaxSafeRegisterWidth() const { + return LAI->getDepChecker().getMaxSafeRegisterWidth(); + } + bool hasStride(Value *V) { return LAI->hasStride(V); } /// Returns true if the target machine supports masked store operation @@ -6217,8 +6221,11 @@ // Remove interleaved store groups with gaps. for (InterleaveGroup *Group : StoreGroups) - if (Group->getNumMembers() != Group->getFactor()) + if (Group->getNumMembers() != Group->getFactor()) { + DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " + "interleaved store group with gaps.\n"); releaseGroup(Group); + } // Remove interleaved groups with gaps (currently only loads) whose memory // accesses may wrap around. 
We have to revisit the getPtrStride analysis, @@ -6355,15 +6362,12 @@ unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; - // Get the maximum safe dependence distance in bits computed by LAA. If the - // loop contains any interleaved accesses, we divide the dependence distance - // by the maximum interleave factor of all interleaved groups. Note that - // although the division ensures correctness, this is a fairly conservative - // computation because the maximum distance computed by LAA may not involve - // any of the interleaved accesses. + // Get the maximum safe dependence distance in bits computed by LAA. + // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from + // the memory accesses that incur the largest restriction (involved in the + // smallest dependence distance). if (Legal->getMaxSafeDepDistBytes() != -1U) - MaxSafeDepDist = - Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor(); + MaxSafeDepDist = Legal->getMaxSafeRegisterWidth(); WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); Index: test/Transforms/LoopVectorize/X86/pr34283-1.ll =================================================================== --- test/Transforms/LoopVectorize/X86/pr34283-1.ll +++ test/Transforms/LoopVectorize/X86/pr34283-1.ll @@ -0,0 +1,50 @@ +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -mcpu=skx -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;Check the new calculation of the maximum safe distance in bits which can be vectorized. +;The previous behavior did not take into account that the stride was 2. +;Therefore the MaxSafeDist was computed as 384 bits instead of 192. 
+ +;#define M 32 +;#define N 2 * M +;unsigned int a [N]; +;void foo(){ +; unsigned int j=0; +; for (j = 0; j < M - 6; ++j) +; { +; a[N - 2 * j] = 69; +; a[N - 12 - 2 * j] = 7; +; } +; +;} + +; CHECK-LABEL: foo +; CHECK: LV: The Widest register is: 192 bits. +; MaxVF in loop is 6, therefore minimum required register is 6 * sizeof(int) * 8 = 192 bits + +@a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @foo() local_unnamed_addr { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl i64 %indvars.iv, 1 + %1 = sub nuw nsw i64 64, %0 + %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %1 + store i32 69, i32* %arrayidx, align 8 + %2 = sub nuw nsw i64 52, %0 + %arrayidx4 = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %2 + store i32 7, i32* %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 26 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} Index: test/Transforms/LoopVectorize/X86/pr34283-2.ll =================================================================== --- test/Transforms/LoopVectorize/X86/pr34283-2.ll +++ test/Transforms/LoopVectorize/X86/pr34283-2.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -loop-vectorize -mcpu=skx < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;Check the new calculation of the maximum safe distance in bits which can be vectorized. +;The previous behavior did not take into account that the stride was 2. +;Therefore the maxVF was computed as 8 instead of 4. 
+ +;#define M 32 +;#define N 2 * M +;unsigned int a [N]; +;void foo(){ +; unsigned int j=0; +; for (j = 0; j < M - 6; ++j) +; { +; a[N - 2 * j] = 69; +; a[N - 12 - 2 * j] = 7; +; } +; +;} + +; CHECK-LABEL: foo +; CHECK: <4 x i32> +; CHECK-NOT: <8 x i32> + +@a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @foo() local_unnamed_addr { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl i64 %indvars.iv, 1 + %1 = sub nuw nsw i64 64, %0 + %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %1 + store i32 69, i32* %arrayidx, align 8 + %2 = sub nuw nsw i64 52, %0 + %arrayidx4 = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %2 + store i32 7, i32* %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 26 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +}