Index: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
@@ -678,6 +678,42 @@
   return false;
 }
 
+/// \brief Check the dependence for two accesses with the same stride
+/// \p Stride. \p Distance is the positive distance and \p TypeByteSize is the
+/// type size in bytes.
+///
+/// \returns true if they are independent.
+static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
+                                          unsigned TypeByteSize) {
+  assert(Stride > 1 && "The stride must be greater than 1");
+  assert(TypeByteSize > 0 && "The type size in bytes must be non-zero");
+  assert(Distance > 0 && "The distance must be non-zero");
+
+  // Skip if the distance is not a multiple of the type size in bytes.
+  if (Distance % TypeByteSize)
+    return false;
+
+  unsigned ScaledDist = Distance / TypeByteSize;
+
+  // There is no dependence if the scaled distance is not a multiple of the
+  // stride.
+  // E.g.
+  //      for (i = 0; i < 1024 ; i += 4)
+  //        A[i+2] = A[i] + 1;
+  //
+  // Two accesses in memory (scaled distance is 2, stride is 4):
+  //     | A[0] |      |      |      | A[4] |      |      |      |
+  //     |      |      | A[2] |      |      |      | A[6] |      |
+  //
+  // E.g.
+  //      for (i = 0; i < 1024 ; i += 3)
+  //        A[i+4] = A[i] + 1;
+  //
+  // Two accesses in memory (scaled distance is 4, stride is 3):
+  //     | A[0] |      |      | A[3] |      |      | A[6] |      |      |
+  //     |      |      |      |      | A[4] |      |      | A[7] |      |
+  return ScaledDist % Stride;
+}
+
 MemoryDepChecker::Dependence::DepType
 MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                               const MemAccessInfo &B, unsigned BIdx,
@@ -778,34 +814,87 @@
 
   unsigned Distance = (unsigned) Val.getZExtValue();
 
+  unsigned Stride = std::abs(StrideAPtr);
+  if (Stride > 1 &&
+      areStridedAccessesIndependent(Distance, Stride, TypeByteSize))
+    return Dependence::NoDep;
+
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                            VectorizerParams::VectorizationFactor : 1);
   unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
                            VectorizerParams::VectorizationInterleave : 1);
+  // The minimum number of iterations for a vectorized/unrolled version.
+  unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
+
+  // It's not vectorizable if the distance is smaller than the minimum distance
+  // needed for a vectorized/unrolled version. Vectorizing each iteration
+  // except the last needs TypeByteSize * Stride bytes; the last iteration
+  // needs only TypeByteSize bytes (there is no need to add the trailing gap).
+  //
+  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+  //      foo(int *A) {
+  //        int *B = (int *)((char *)A + 14);
+  //        for (i = 0 ; i < 1024 ; i += 2)
+  //          B[i] = A[i] + 1;
+  //      }
+  //
+  // Two accesses in memory (stride is 2):
+  //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
+  //                              | B[0] |      | B[2] |      | B[4] |
+  //
+  // The distance needed to vectorize all iterations except the last is
+  // 4 * 2 * (MinNumIter - 1); the last iteration needs another 4 bytes.
+  // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
+  //
+  // If MinNumIter is 2, the loop is vectorizable, as the minimum distance
+  // needed is 12, which is less than the distance (14).
+  //
+  // If MinNumIter is 4 (say, a user forces the vectorization factor to be 4),
+  // the minimum distance needed is 28, which is greater than the distance
+  // (14), so it is not safe to vectorize.
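+  //
+  // Restating the two results above with the formula computed below (pure
+  // arithmetic on the example, no new logic):
+  //   MinDistanceNeeded = TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize
+  //                     = 4 * 2 * (2 - 1) + 4 = 12 when MinNumIter is 2,
+  //                     = 4 * 2 * (4 - 1) + 4 = 28 when MinNumIter is 4.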
+  unsigned MinDistanceNeeded =
+      TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
+  if (MinDistanceNeeded > Distance) {
+    DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance
+                 << '\n');
+    return Dependence::Backward;
+  }
-
-  // The distance must be bigger than the size needed for a vectorized version
-  // of the operation and the size of the vectorized operation must not be
-  // bigger than the currrent maximum size.
-  if (Distance < 2*TypeByteSize ||
-      2*TypeByteSize > MaxSafeDepDistBytes ||
-      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
-    DEBUG(dbgs() << "LAA: Failure because of Positive distance "
-          << Val.getSExtValue() << '\n');
+  // Unsafe if the minimum distance needed is greater than the max safe
+  // distance.
+  if (MinDistanceNeeded > MaxSafeDepDistBytes) {
+    DEBUG(dbgs() << "LAA: Failure because it needs at least "
+                 << MinDistanceNeeded << " bytes\n");
     return Dependence::Backward;
   }
 
   // Positive distance bigger than max vectorization factor.
-  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
-    Distance : MaxSafeDepDistBytes;
+  // FIXME: Should use the max factor instead of the max distance in bytes,
+  // which cannot handle different types.
+  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+  //      void foo (int *A, char *B) {
+  //        for (unsigned i = 0; i < 1024; i++) {
+  //          A[i+2] = A[i] + 1;
+  //          B[i+2] = B[i] + 1;
+  //        }
+  //      }
+  //
+  // This case is currently unsafe according to the max safe distance. If we
+  // analyze the two accesses on array B, the max safe dependence distance
+  // is 2. Then we analyze the accesses on array A, whose minimum distance
+  // needed is 8, which is greater than 2, so vectorization is forbidden. But
+  // actually both A and B could be vectorized with a factor of 2.
+  MaxSafeDepDistBytes =
+      Distance < MaxSafeDepDistBytes ? Distance : MaxSafeDepDistBytes;
 
   bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
   if (IsTrueDataDependence &&
       couldPreventStoreLoadForward(Distance, TypeByteSize))
     return Dependence::BackwardVectorizableButPreventsForwarding;
 
-  DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() <<
-        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
+  DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
+               << " with max VF = "
+               << MaxSafeDepDistBytes / (TypeByteSize * Stride) << '\n');
 
   return Dependence::BackwardVectorizable;
 }
Index: llvm/trunk/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
===================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -0,0 +1,540 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; The following cases have no dependence.
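+; In each case below, the scaled distance between the two accesses is not a
+; multiple of the stride, so areStridedAccessesIndependent proves them
+; independent. E.g. in nodep_Read_Write the distance is 1 element and the
+; stride is 3 elements, and 1 % 3 != 0.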
+
+; void nodep_Read_Write(int *A) {
+;   int *B = A + 1;
+;   for (unsigned i = 0; i < 1024; i+=3)
+;     B[i] = A[i] + 1;
+; }
+
+; CHECK: function 'nodep_Read_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Run-time memory checks:
+
+define void @nodep_Read_Write(i32* nocapture %A) {
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %A, i64 1
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; int nodep_Write_Read(int *A) {
+;   int sum = 0;
+;   for (unsigned i = 0; i < 1024; i+=4) {
+;     A[i] = i;
+;     sum += A[i+3];
+;   }
+;
+;   return sum;
+; }
+
+; CHECK: function 'nodep_Write_Read':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Run-time memory checks:
+
+define i32 @nodep_Write_Read(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.013 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 3
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1
+  %2 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %2, %sum.013
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; void nodep_Write_Write(int *A) {
+;   for (unsigned i = 0; i < 1024; i+=2) {
+;     A[i] = i;
+;     A[i+1] = i+1;
+;   }
+; }
+
+; CHECK: function 'nodep_Write_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Run-time memory checks:
+
+define void @nodep_Write_Write(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %1
+  %2 = trunc i64 %1 to i32
+  store i32 %2, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; The following cases are unsafe dependences and are not vectorizable.
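+; In each case below, the scaled distance is a multiple of the stride and the
+; distance is smaller than the minimum distance needed,
+; TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize. E.g. in
+; unsafe_Read_Write the distance is 12 bytes but 4 * 3 * (2 - 1) + 4 = 16
+; bytes are needed, so the dependence is an unsafe Backward dependence.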
+
+; void unsafe_Read_Write(int *A) {
+;   for (unsigned i = 0; i < 1024; i+=3)
+;     A[i+3] = A[i] + 1;
+; }
+
+; CHECK: function 'unsafe_Read_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, i32* %arrayidx3, align 4
+
+define void @unsafe_Read_Write(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.010 = phi i32 [ 0, %entry ], [ %add1, %for.body ]
+  %idxprom = zext i32 %i.010 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %add1 = add i32 %i.010, 3
+  %idxprom2 = zext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %idxprom2
+  store i32 %add, i32* %arrayidx3, align 4
+  %cmp = icmp ult i32 %add1, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; int unsafe_Write_Read(int *A) {
+;   int sum = 0;
+;   for (unsigned i = 0; i < 1024; i+=4) {
+;     A[i] = i;
+;     sum += A[i+4];
+;   }
+;
+;   return sum;
+; }
+
+; CHECK: function 'unsafe_Write_Read':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4
+
+define i32 @unsafe_Write_Read(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.013 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %1, %sum.013
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; void unsafe_Write_Write(int *A) {
+;   for (unsigned i = 0; i < 1024; i+=2) {
+;     A[i] = i;
+;     A[i+2] = i+1;
+;   }
+; }
+
+; CHECK: function 'unsafe_Write_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %2, i32* %arrayidx3, align 4
+
+define void @unsafe_Write_Write(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %2 = trunc i64 %1 to i32
+  store i32 %2, i32* %arrayidx3, align 4
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; The following cases check that strided accesses can be vectorized.
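+; Here the distance is at least the minimum distance needed. E.g. in
+; vectorizable_Read_Write the distance is 16 bytes and only
+; 4 * 2 * (2 - 1) + 4 = 12 bytes are needed, so the dependence is
+; BackwardVectorizable.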
+
+; void vectorizable_Read_Write(int *A) {
+;   int *B = A + 4;
+;   for (unsigned i = 0; i < 1024; i+=2)
+;     B[i] = A[i] + 1;
+; }
+
+; CHECK: function 'vectorizable_Read_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: %0 = load i32, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4
+
+define void @vectorizable_Read_Write(i32* nocapture %A) {
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %A, i64 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; int vectorizable_Write_Read(int *A) {
+;   int *B = A + 4;
+;   int sum = 0;
+;   for (unsigned i = 0; i < 1024; i+=2) {
+;     A[i] = i;
+;     sum += B[i];
+;   }
+;
+;   return sum;
+; }
+
+; CHECK: function 'vectorizable_Write_Read':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4
+
+define i32 @vectorizable_Write_Read(i32* nocapture %A) {
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %A, i64 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %sum.013
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; void vectorizable_Write_Write(int *A) {
+;   int *B = A + 4;
+;   for (unsigned i = 0; i < 1024; i+=2) {
+;     A[i] = i;
+;     B[i] = i+1;
+;   }
+; }
+
+; CHECK: function 'vectorizable_Write_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %2, i32* %arrayidx2, align 4
+
+define void @vectorizable_Write_Write(i32* nocapture %A) {
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %A, i64 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv
+  %2 = trunc i64 %1 to i32
+  store i32 %2, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; void vectorizable_unscaled_Read_Write(int *A) {
+;   int *B = (int *)((char *)A + 14);
+;   for (unsigned i = 0; i < 1024; i+=2)
+;     B[i] = A[i] + 1;
+; }
+
+; FIXME: This case looks like the previous case @vectorizable_Read_Write. It
+; should be vectorizable.
+
+; CHECK: function 'vectorizable_unscaled_Read_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: %2 = load i32, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4
+
+define void @vectorizable_unscaled_Read_Write(i32* nocapture %A) {
+entry:
+  %0 = bitcast i32* %A to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %0, i64 14
+  %1 = bitcast i8* %add.ptr to i32*
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %2, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; int vectorizable_unscaled_Write_Read(int *A) {
+;   int *B = (int *)((char *)A + 17);
+;   int sum = 0;
+;   for (unsigned i = 0; i < 1024; i+=2) {
+;     A[i] = i;
+;     sum += B[i];
+;   }
+;
+;   return sum;
+; }
+
+; CHECK: function 'vectorizable_unscaled_Write_Read':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %2, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: %3 = load i32, i32* %arrayidx2, align 4
+
+define i32 @vectorizable_unscaled_Write_Read(i32* nocapture %A) {
+entry:
+  %0 = bitcast i32* %A to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %0, i64 17
+  %1 = bitcast i8* %add.ptr to i32*
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %2 = trunc i64 %indvars.iv to i32
+  store i32 %2, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %3, %sum.013
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; void unsafe_unscaled_Read_Write(int *A) {
+;   int *B = (int *)((char *)A + 11);
+;   for (unsigned i = 0; i < 1024; i+=2)
+;     B[i] = A[i] + 1;
+; }
+
+; CHECK: function 'unsafe_unscaled_Read_Write':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %2 = load i32, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4
+
+define void @unsafe_unscaled_Read_Write(i32* nocapture %A) {
+entry:
+  %0 = bitcast i32* %A to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %0, i64 11
+  %1 = bitcast i8* %add.ptr to i32*
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %2, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK: function 'unsafe_unscaled_Read_Write2':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %2 = load i32, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4
+
+; void unsafe_unscaled_Read_Write2(int *A) {
+;   int *B = (int *)((char *)A + 1);
+;   for (unsigned i = 0; i < 1024; i+=2)
+;     B[i] = A[i] + 1;
+; }
+
+define void @unsafe_unscaled_Read_Write2(i32* nocapture %A) {
+entry:
+  %0 = bitcast i32* %A to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %0, i64 1
+  %1 = bitcast i8* %add.ptr to i32*
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %2, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; The following case checks that interleaved stores have dependences with
+; another store and cannot pass the dependence check.
+
+; void interleaved_stores(int *A) {
+;   int *B = (int *) ((char *)A + 1);
+;   for(int i = 0; i < 1024; i+=2) {
+;     B[i] = i;        // (1)
+;     A[i+1] = i + 1;  // (2)
+;     B[i+1] = i + 1;  // (3)
+;   }
+; }
+;
+; The access (2) overlaps with (1) and (3).
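+; With B = (char *)A + 1 and 4-byte ints, the three stores in iteration i
+; touch these byte offsets from A:
+;   (1) B[i]   -> bytes 4*i+1 .. 4*i+4
+;   (2) A[i+1] -> bytes 4*i+4 .. 4*i+7
+;   (3) B[i+1] -> bytes 4*i+5 .. 4*i+8
+; so (2) overlaps (1) at byte 4*i+4 and (3) at bytes 4*i+5 .. 4*i+7.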
+
+; CHECK: function 'interleaved_stores':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %4, i32* %arrayidx5, align 4 ->
+; CHECK-NEXT: store i32 %4, i32* %arrayidx9, align 4
+; CHECK: Backward:
+; CHECK-NEXT: store i32 %2, i32* %arrayidx2, align 4 ->
+; CHECK-NEXT: store i32 %4, i32* %arrayidx5, align 4
+
+define void @interleaved_stores(i32* nocapture %A) {
+entry:
+  %0 = bitcast i32* %A to i8*
+  %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1
+  %1 = bitcast i8* %incdec.ptr to i32*
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %2 = trunc i64 %indvars.iv to i32
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx2, align 4
+  %3 = or i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %3
+  %4 = trunc i64 %3 to i32
+  store i32 %4, i32* %arrayidx5, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3
+  store i32 %4, i32* %arrayidx9, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}