diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -90,7 +90,11 @@ if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L)) return false; - return AR->getStepRecurrence(SE) == &ElemSize; + const SCEV *StepRec = AR->getStepRecurrence(SE); + if (StepRec && SE.isKnownNegative(StepRec)) + StepRec = SE.getNegativeSCEV(StepRec); + + return StepRec == &ElemSize; } /// Compute the trip count for the given loop \p L. Return the SCEV expression @@ -285,10 +289,13 @@ const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS); Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType()); - Stride = SE.getNoopOrSignExtend(Stride, WiderType); + if (SE.isKnownNegative(Stride)) + Stride = SE.getNegativeSCEV(Stride); + Stride = SE.getNoopOrAnyExtend(Stride, WiderType); TripCount = SE.getNoopOrAnyExtend(TripCount, WiderType); const SCEV *Numerator = SE.getMulExpr(Stride, TripCount); RefCost = SE.getUDivExpr(Numerator, CacheLineSize); + LLVM_DEBUG(dbgs().indent(4) << "Access is consecutive: RefCost=(TripCount*Stride)/CLS=" << *RefCost << "\n"); @@ -349,6 +356,19 @@ return false; } + // The array may be accessed in reverse, for example: + // for (i = N; i > 0; i--) + // A[i] = 0; + // In this case, reconstruct the access function using the absolute value + // of the step recurrence. + const SCEVAddRecExpr *AccessFnAR = dyn_cast(AccessFn); + const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr; + + if (StepRec && SE.isKnownNegative(StepRec)) + AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(), + SE.getNegativeSCEV(StepRec), + AccessFnAR->getLoop(), + AccessFnAR->getNoWrapFlags()); const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize); Subscripts.push_back(Div); Sizes.push_back(ElemSize); @@ -396,6 +416,7 @@ const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize); const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS); + Stride = SE.isKnownNegative(Stride) ? SE.getNegativeSCEV(Stride) : Stride; return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize); } @@ -537,6 +558,18 @@ dbgs().indent(2) << Representative << "\n"; }); + + // FIXME: Both positive and negative access functions will be placed + // into the same reference group, resulting in a bi-directional array + // access such as: + // for (i = N; i > 0; i--) + // A[i] = A[N - i]; + // having the same cost calculation as a single dimention access pattern + // for (i = 0; i < N; i++) + // A[i] = A[i]; + // when in actuality, depending on the array size, the first example + // should have a cost closer to 2x the second due to the two cache + // access per iteration from opposite ends of the array Optional HasTemporalReuse = R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA); Optional HasSpacialReuse = diff --git a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll --- a/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll +++ b/llvm/test/Analysis/LoopCacheAnalysis/PowerPC/compute-cost.ll @@ -31,5 +31,125 @@ ; Exit blocks for.end: ; preds = %for.cond ret void +} + + + +; Check IndexedReference::computeRefCost can handle negative stride + +; CHECK: Loop 'for.neg.cond' has cost = 64 + +define void @handle_to_ptr_neg_stride(%struct._Handleitem** %blocks) { +; Preheader: +entry: + br label %for.neg.cond + +; Loop: +for.neg.cond: ; preds = %for.neg.body, %entry + %i.0 = phi i32 [ 1023, %entry ], [ %dec, %for.neg.body ] + %cmp = icmp sgt i32 %i.0, 0 + br i1 %cmp, label %for.neg.body, label %for.neg.end + +for.neg.body: ; preds = %for.neg.cond + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds %struct._Handleitem*, %struct._Handleitem** %blocks, i64 %idxprom + store %struct._Handleitem* null, %struct._Handleitem** %arrayidx, align 8 + %dec = add nsw i32 %i.0, -1 + br label %for.neg.cond + +; Exit blocks +for.neg.end: ; preds = %for.neg.cond + ret void +} + + + +; for (int i = 40960; i > 0; i--) +; B[i] = B[40960 - i]; + +; FIXME: Currently negative access functions are treated the same as positive +; access functions. When this is fixed this testcase should have a cost +; approximately 2x higher. + +; CHECK: Loop 'for.cond2' has cost = 2560 +define void @Test2(double* %B) { +entry: + br label %for.cond2 + +for.cond2: ; preds = %for.body, %entry + %i.0 = phi i32 [ 40960, %entry ], [ %dec, %for.body ] + %cmp = icmp sgt i32 %i.0, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %sub = sub nsw i32 40960, %i.0 + %idxprom = sext i32 %sub to i64 + %arrayidx = getelementptr inbounds double, double* %B, i64 %idxprom + %0 = load double, double* %arrayidx, align 8 + %idxprom1 = sext i32 %i.0 to i64 + %arrayidx2 = getelementptr inbounds double, double* %B, i64 %idxprom1 + store double %0, double* %arrayidx2, align 8 + %dec = add nsw i32 %i.0, -1 + br label %for.cond2 +for.end: ; preds = %for.cond + ret void +} + + + +; for (i = 40960; i > 0; i--) +; C[i] = C[i]; + +; CHECK: Loop 'for.cond3' has cost = 2560 +define void @Test3(double** %C) { +entry: + br label %for.cond3 + +for.cond3: ; preds = %for.body, %entry + %i.0 = phi i32 [ 40960, %entry ], [ %dec, %for.body ] + %cmp = icmp sgt i32 %i.0, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds double*, double** %C, i64 %idxprom + %0 = load double*, double** %arrayidx, align 8 + %idxprom1 = sext i32 %i.0 to i64 + %arrayidx2 = getelementptr inbounds double*, double** %C, i64 %idxprom1 + store double* %0, double** %arrayidx2, align 8 + %dec = add nsw i32 %i.0, -1 + br label %for.cond3 + +for.end: ; preds = %for.cond + ret void +} + + + +; for (i = 0; i < 40960; i++) +; D[i] = D[i]; + +; CHECK: Loop 'for.cond4' has cost = 2560 +define void @Test4(double** %D) { +entry: + br label %for.cond4 + +for.cond4: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %i.0, 40960 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds double*, double** %D, i64 %idxprom + %0 = load double*, double** %arrayidx, align 8 + %idxprom1 = sext i32 %i.0 to i64 + %arrayidx2 = getelementptr inbounds double*, double** %D, i64 %idxprom1 + store double* %0, double** %arrayidx2, align 8 + %inc = add nsw i32 %i.0, 1 + br label %for.cond4 + +for.end: ; preds = %for.cond + ret void }