diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -924,10 +924,28 @@ void updateDirection(Dependence::DVEntry &Level, const Constraint &CurConstraint) const; + /// Given a linear access function, tries to recover subscripts + /// for each dimension of the array element access. bool tryDelinearize(Instruction *Src, Instruction *Dst, SmallVectorImpl &Pair); - private: + /// Tries to delinearize access function for a fixed size multi-dimensional + /// array, by deriving subscripts from GEP instructions. Returns true upon + /// success and false otherwise. + bool tryDelinearizeFixedSize(Instruction *Src, Instruction *Dst, + const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, + SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts); + + /// Tries to delinearize access function for a multi-dimensional array with + /// symbolic runtime sizes. + /// Returns true upon success and false otherwise. + bool tryDelinearizeParametricSize( + Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts); + /// checkSubscript - Helper function for checkSrcSubscript and /// checkDstSubscript to avoid duplicate code bool checkSubscript(const SCEV *Expr, const Loop *LoopNest, diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3264,23 +3264,134 @@ assert(isLoadOrStore(Dst) && "instruction is not load or store"); Value *SrcPtr = getLoadStorePointerOperand(Src); Value *DstPtr = getLoadStorePointerOperand(Dst); - Loop *SrcLoop = LI->getLoopFor(Src->getParent()); Loop *DstLoop = LI->getLoopFor(Dst->getParent()); + const SCEV *SrcAccessFn = SE->getSCEVAtScope(SrcPtr, SrcLoop); + const SCEV *DstAccessFn = SE->getSCEVAtScope(DstPtr, DstLoop); + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcAccessFn)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstAccessFn)); + + if (!SrcBase || !DstBase || SrcBase != DstBase) + return false; - // Below code mimics the code in Delinearization.cpp - const SCEV *SrcAccessFn = - SE->getSCEVAtScope(SrcPtr, SrcLoop); - const SCEV *DstAccessFn = - SE->getSCEVAtScope(DstPtr, DstLoop); + SmallVector SrcSubscripts, DstSubscripts; + + if (!tryDelinearizeFixedSize(Src, Dst, SrcAccessFn, DstAccessFn, + SrcSubscripts, DstSubscripts) && + !tryDelinearizeParametricSize(Src, Dst, SrcAccessFn, DstAccessFn, + SrcSubscripts, DstSubscripts)) + return false; + + int Size = SrcSubscripts.size(); + LLVM_DEBUG({ + dbgs() << "\nSrcSubscripts: "; + for (int I = 0; I < Size; I++) + dbgs() << *SrcSubscripts[I]; + dbgs() << "\nDstSubscripts: "; + for (int I = 0; I < Size; I++) + dbgs() << *DstSubscripts[I]; + }); + // The delinearization transforms a single-subscript MIV dependence test into + // a multi-subscript SIV dependence test that is easier to compute. So we + // resize Pair to contain as many pairs of subscripts as the delinearization + // has found, and then initialize the pairs following the delinearization. + Pair.resize(Size); + for (int I = 0; I < Size; ++I) { + Pair[I].Src = SrcSubscripts[I]; + Pair[I].Dst = DstSubscripts[I]; + unifySubscriptType(&Pair[I]); + } + + return true; +} + +bool DependenceInfo::tryDelinearizeFixedSize( + Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts) { + + // In general we cannot safely assume that the subscripts recovered from GEPs + // are in the range of values defined for their corresponding array + // dimensions. For example some C language usage/interpretation make it + // impossible to verify this at compile-time. As such we give up here unless + // we can assume that the subscripts do not overlap into neighboring + // dimensions and that the number of dimensions matches the number of + // subscripts being recovered. + if (!DisableDelinearizationChecks) + return false; + + Value *SrcPtr = getLoadStorePointerOperand(Src); + Value *DstPtr = getLoadStorePointerOperand(Dst); const SCEVUnknown *SrcBase = dyn_cast(SE->getPointerBase(SrcAccessFn)); const SCEVUnknown *DstBase = dyn_cast(SE->getPointerBase(DstAccessFn)); + assert(SrcBase && DstBase && SrcBase == DstBase && + "expected src and dst scev unknowns to be equal"); - if (!SrcBase || !DstBase || SrcBase != DstBase) + // Check the simple case where the array dimensions are fixed size. + auto *SrcGEP = dyn_cast(SrcPtr); + auto *DstGEP = dyn_cast(DstPtr); + if (!SrcGEP || !DstGEP) + return false; + + SmallVector SrcSizes, DstSizes; + SE->getIndexExpressionsFromGEP(SrcGEP, SrcSubscripts, SrcSizes); + SE->getIndexExpressionsFromGEP(DstGEP, DstSubscripts, DstSizes); + + // Check that the two size arrays are non-empty and equal in length and + // value. + if (SrcSizes.empty() || SrcSubscripts.size() <= 1 || + SrcSizes.size() != DstSizes.size() || + !std::equal(SrcSizes.begin(), SrcSizes.end(), DstSizes.begin())) { + SrcSubscripts.clear(); + DstSubscripts.clear(); return false; + } + + Value *SrcBasePtr = SrcGEP->getOperand(0); + Value *DstBasePtr = DstGEP->getOperand(0); + while (auto *PCast = dyn_cast(SrcBasePtr)) + SrcBasePtr = PCast->getOperand(0); + while (auto *PCast = dyn_cast(DstBasePtr)) + DstBasePtr = PCast->getOperand(0); + + // Check that for identical base pointers we do not miss index offsets + // that have been added before this GEP is applied. + if (SrcBasePtr == SrcBase->getValue() && DstBasePtr == DstBase->getValue()) { + assert(SrcSubscripts.size() == DstSubscripts.size() && + SrcSubscripts.size() == SrcSizes.size() + 1 && + "Expected equal number of entries in the list of sizes and " + "subscripts."); + LLVM_DEBUG({ + dbgs() << "Delinearized subscripts of fixed-size array\n" + << "SrcGEP:" << *SrcGEP << "\n" + << "DstGEP:" << *DstGEP << "\n"; + }); + return true; + } + + SrcSubscripts.clear(); + DstSubscripts.clear(); + return false; +} + +bool DependenceInfo::tryDelinearizeParametricSize( + Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts) { + + Value *SrcPtr = getLoadStorePointerOperand(Src); + Value *DstPtr = getLoadStorePointerOperand(Dst); + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcAccessFn)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstAccessFn)); + assert(SrcBase && DstBase && SrcBase == DstBase && + "expected src and dst scev unknowns to be equal"); const SCEV *ElementSize = SE->getElementSize(Src); if (ElementSize != SE->getElementSize(Dst)) @@ -3304,7 +3415,6 @@ SE->findArrayDimensions(Terms, Sizes, ElementSize); // Third step: compute the access functions for each subscript. - SmallVector SrcSubscripts, DstSubscripts; SE->computeAccessFunctions(SrcAR, SrcSubscripts, Sizes); SE->computeAccessFunctions(DstAR, DstSubscripts, Sizes); @@ -3313,7 +3423,7 @@ SrcSubscripts.size() != DstSubscripts.size()) return false; - int size = SrcSubscripts.size(); + size_t Size = SrcSubscripts.size(); // Statically check that the array bounds are in-range. The first subscript we // don't have a size for and it cannot overflow into another subscript, so is @@ -3322,40 +3432,20 @@ // FIXME: It may be better to record these sizes and add them as constraints // to the dependency checks. if (!DisableDelinearizationChecks) - for (int i = 1; i < size; ++i) { - if (!isKnownNonNegative(SrcSubscripts[i], SrcPtr)) + for (size_t I = 1; I < Size; ++I) { + if (!isKnownNonNegative(SrcSubscripts[I], SrcPtr)) return false; - if (!isKnownLessThan(SrcSubscripts[i], Sizes[i - 1])) + if (!isKnownLessThan(SrcSubscripts[I], Sizes[I - 1])) return false; - if (!isKnownNonNegative(DstSubscripts[i], DstPtr)) + if (!isKnownNonNegative(DstSubscripts[I], DstPtr)) return false; - if (!isKnownLessThan(DstSubscripts[i], Sizes[i - 1])) + if (!isKnownLessThan(DstSubscripts[I], Sizes[I - 1])) return false; } - LLVM_DEBUG({ - dbgs() << "\nSrcSubscripts: "; - for (int i = 0; i < size; i++) - dbgs() << *SrcSubscripts[i]; - dbgs() << "\nDstSubscripts: "; - for (int i = 0; i < size; i++) - dbgs() << *DstSubscripts[i]; - }); - - // The delinearization transforms a single-subscript MIV dependence test into - // a multi-subscript SIV dependence test that is easier to compute. So we - // resize Pair to contain as many pairs of subscripts as the delinearization - // has found, and then initialize the pairs following the delinearization. - Pair.resize(size); - for (int i = 0; i < size; ++i) { - Pair[i].Src = SrcSubscripts[i]; - Pair[i].Dst = DstSubscripts[i]; - unifySubscriptType(&Pair[i]); - } - return true; } diff --git a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll @@ -0,0 +1,106 @@ +; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ +; RUN: -da-disable-delinearization-checks | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) { +;; A[i][j][k] = i; +;; } +;; for (long int k = 0; k < n; k++) { +;; *B++ = A[i + 3][j + 2][k + 1]; + +define void @p2(i64 %n, [100 x [100 x i64]]* %A, i64* %B) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +; CHECK-LABEL: p2 +; CHECK: da analyze - none! +; CHECK: da analyze - flow [-3 -2]! +; CHECK: da analyze - confused! +; CHECK: da analyze - none! +; CHECK: da analyze - confused! +; CHECK: da analyze - output [* * *]! + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %B.addr.012 = phi i64* [ %B.addr.1.lcssa, %for.inc24 ], [ %B, %for.cond1.preheader.preheader ] + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %B.addr.18 = phi i64* [ %B.addr.2.lcssa, %for.inc21 ], [ %B.addr.012, %for.cond4.preheader.preheader ] + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, i64* %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %B.addr.24 = phi i64* [ %incdec.ptr, %for.body12 ], [ %B.addr.18, %for.body12.preheader ] + %add = add nsw i64 %k9.05, 1 + %add13 = add nsw i64 %j.07, 2 + %add14 = add nsw i64 %i.011, 3 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %A, i64 %add14, i64 %add13, i64 %add + %0 = load i64, i64* %arrayidx17, align 8 + %incdec.ptr = getelementptr inbounds i64, i64* %B.addr.24, i64 1 + store i64 %0, i64* %B.addr.24, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + %scevgep = getelementptr i64, i64* %B.addr.18, i64 %n + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %B.addr.2.lcssa = phi i64* [ %B.addr.18, %for.cond10.loopexit ], [ %scevgep, %for.inc21.loopexit ] + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + %B.addr.2.lcssa.lcssa = phi i64* [ %B.addr.2.lcssa, %for.inc21 ] + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %B.addr.1.lcssa = phi i64* [ %B.addr.012, %for.cond1.preheader ], [ %B.addr.2.lcssa.lcssa, %for.inc24.loopexit ] + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheckFixedSize.ll @@ -0,0 +1,120 @@ +; RUN: opt < %s -disable-output -passes="print" \ +; RUN: -da-disable-delinearization-checks 2>&1 | FileCheck %s +; RUN: opt < %s -da -analyze -da-disable-delinearization-checks | FileCheck %s + +; CHECK-LABEL: t1 +; CHECK: da analyze - none! +; CHECK: da analyze - consistent anti [1 -2]! +; CHECK: da analyze - none! + +;; #define N 1024 +;; #define M 2048 +;; void t1(int a[N][M]) { +;; for (int i = 0; i < N-1; ++i) +;; for (int j = 2; j < M; ++j) +;; a[i][j] = a[i+1][j-2]; +;; } + +define void @t1([2048 x i32]* %a) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc11 + %indvars.iv4 = phi i64 [ 0, %entry ], [ %indvars.iv.next5, %for.inc11 ] + br label %for.body4 + +for.body4: ; preds = %for.body, %for.body4 + %indvars.iv = phi i64 [ 2, %for.body ], [ %indvars.iv.next, %for.body4 ] + %0 = add nuw nsw i64 %indvars.iv4, 1 + %1 = add nsw i64 %indvars.iv, -2 + %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 %0, i64 %1 + %2 = load i32, i32* %arrayidx6, align 4 + %arrayidx10 = getelementptr inbounds [2048 x i32], [2048 x i32]* %a, i64 %indvars.iv4, i64 %indvars.iv + store i32 %2, i32* %arrayidx10, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 2048 + br i1 %exitcond, label %for.body4, label %for.inc11 + +for.inc11: ; preds = %for.body4 + %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1 + %exitcond7 = icmp ne i64 %indvars.iv.next5, 1023 + br i1 %exitcond7, label %for.body, label %for.end13 + +for.end13: ; preds = %for.inc11 + ret void +} + + +; CHECK-LABEL: t2 +; CHECK: da analyze - none! +; CHECK: da analyze - consistent anti [1 -2 0 -3 2]! +; CHECK: da analyze - none! + +;; #define N 1024 +;; #define M 2048 +;; void t2(int a[][N][N][N][M]) { +;; for (int i1 = 0; i1 < N-1; ++i1) +;; for (int i2 = 2; i2 < N; ++i2) +;; for (int i3 = 0; i3 < N; ++i3) +;; for (int i4 = 3; i4 < N; ++i4) +;; for (int i5 = 0; i5 < M-2; ++i5) +;; a[i1][i2][i3][i4][i5] = a[i1+1][i2-2][i3][i4-3][i5+2]; +;; } + +define void @t2([1024 x [1024 x [1024 x [2048 x i32]]]]* %a) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc46 + %indvars.iv18 = phi i64 [ 0, %entry ], [ %indvars.iv.next19, %for.inc46 ] + br label %for.body4 + +for.body4: ; preds = %for.body, %for.inc43 + %indvars.iv14 = phi i64 [ 2, %for.body ], [ %indvars.iv.next15, %for.inc43 ] + br label %for.body8 + +for.body8: ; preds = %for.body4, %for.inc40 + %indvars.iv11 = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next12, %for.inc40 ] + br label %for.body12 + +for.body12: ; preds = %for.body8, %for.inc37 + %indvars.iv7 = phi i64 [ 3, %for.body8 ], [ %indvars.iv.next8, %for.inc37 ] + br label %for.body16 + +for.body16: ; preds = %for.body12, %for.body16 + %indvars.iv = phi i64 [ 0, %for.body12 ], [ %indvars.iv.next, %for.body16 ] + %0 = add nuw nsw i64 %indvars.iv18, 1 + %1 = add nsw i64 %indvars.iv14, -2 + %2 = add nsw i64 %indvars.iv7, -3 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx26 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %0, i64 %1, i64 %indvars.iv11, i64 %2, i64 %3 + %4 = load i32, i32* %arrayidx26, align 4 + %arrayidx36 = getelementptr inbounds [1024 x [1024 x [1024 x [2048 x i32]]]], [1024 x [1024 x [1024 x [2048 x i32]]]]* %a, i64 %indvars.iv18, i64 %indvars.iv14, i64 %indvars.iv11, i64 %indvars.iv7, i64 %indvars.iv + store i32 %4, i32* %arrayidx36, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 2046 + br i1 %exitcond, label %for.body16, label %for.inc37 + +for.inc37: ; preds = %for.body16 + %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 + %exitcond10 = icmp ne i64 %indvars.iv.next8, 1024 + br i1 %exitcond10, label %for.body12, label %for.inc40 + +for.inc40: ; preds = %for.inc37 + %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1 + %exitcond13 = icmp ne i64 %indvars.iv.next12, 1024 + br i1 %exitcond13, label %for.body8, label %for.inc43 + +for.inc43: ; preds = %for.inc40 + %indvars.iv.next15 = add nuw nsw i64 %indvars.iv14, 1 + %exitcond17 = icmp ne i64 %indvars.iv.next15, 1024 + br i1 %exitcond17, label %for.body4, label %for.inc46 + +for.inc46: ; preds = %for.inc43 + %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1 + %exitcond21 = icmp ne i64 %indvars.iv.next19, 1023 + br i1 %exitcond21, label %for.body, label %for.end48 + +for.end48: ; preds = %for.inc46 + ret void +} diff --git a/llvm/test/Transforms/LoopInterchange/currentLimitation.ll b/llvm/test/Transforms/LoopInterchange/currentLimitation.ll --- a/llvm/test/Transforms/LoopInterchange/currentLimitation.ll +++ b/llvm/test/Transforms/LoopInterchange/currentLimitation.ll @@ -2,6 +2,11 @@ ; RUN: -pass-remarks-output=%t -verify-loop-info -verify-dom-info -S | FileCheck -check-prefix=IR %s ; RUN: FileCheck --input-file=%t %s +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' \ +; RUN: -da-disable-delinearization-checks -pass-remarks-output=%t \ +; RUN: -verify-loop-info -verify-dom-info -S | FileCheck -check-prefix=IR %s +; RUN: FileCheck --check-prefix=DELIN --input-file=%t %s + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -16,13 +21,14 @@ ;; for(int j=1;j