Index: llvm/include/llvm/Analysis/ScalarEvolution.h =================================================================== --- llvm/include/llvm/Analysis/ScalarEvolution.h +++ llvm/include/llvm/Analysis/ScalarEvolution.h @@ -798,6 +798,12 @@ /// evaluation type which can not result in overflow. const SCEV *getTripCountFromExitCount(const SCEV *ExitCount); + /// Returns the upper bound of the loop trip count infered from memory access. + /// Can not access bytes starting outside the statically allocated size + /// without being immediate UB. + /// Returns SCEVCouldNotCompute if the trip count could not be inferred. + const SCEV *getConstantMaxTripCountFromMemAccess(const Loop *L); + /// Convert from an "exit count" (i.e. "backedge taken count") to a "trip /// count". A "trip count" is the number of times the header of the loop /// will execute if an exit is taken after the specified number of backedges Index: llvm/lib/Analysis/ScalarEvolution.cpp =================================================================== --- llvm/lib/Analysis/ScalarEvolution.cpp +++ llvm/lib/Analysis/ScalarEvolution.cpp @@ -249,6 +249,11 @@ cl::desc("Infer nuw/nsw flags using context where suitable"), cl::init(true)); +static cl::opt UseMemoryAccessUBForBEInference( + "scalar-evolution-infer-max-trip-count-from-memory-access", cl::Hidden, + cl::desc("Infer loop max trip count from memory access"), + cl::init(false)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -8086,6 +8091,161 @@ return ((unsigned)ExitConst->getZExtValue()) + 1; } +/// Collect Load/Store instructions that must be executed on each iteration +/// inside loop \p L . +static void +collectExeLoadStoreInsideLoop(const Loop *L, DominatorTree &DT, + SmallVector &MemInsts) { + // It is difficult to tell if the load/store instruction is executed on every + // iteration inside an irregular loop. + if (!L->isLoopSimplifyForm() || !L->isInnermost()) + return; + + // FIXME: To make the scene more typical, we only analysis loops that have + // one exiting block and that block must be the latch. To make it easier to + // capture loops that have memory access and memory access will be executed + // on each iteration. + const BasicBlock *LoopLatch = L->getLoopLatch(); + assert(LoopLatch && "See defination of simplify form loop."); + if (L->getExitingBlock() != LoopLatch) + return; + + for (auto *BB : L->getBlocks()) { + // Go here, we can know that Loop is a single exiting and simplified form + // loop. Make sure that infer from Memory Operation in those BBs must be + // executed in loop. First step, we can make sure that max execution time + // of MemAccessBB in loop represents latch max excution time. + // Such BB as below should be skipped: + // Entry + // │ + // ┌─────▼─────┐ + // │Loop Header◄─────┐ + // └──┬──────┬─┘ │ + // │ │ │ + // ┌────────▼──┐ ┌─▼─────┐ │ + // │MemAccessBB│ │OtherBB│ │ + // └────────┬──┘ └─┬─────┘ │ + // │ │ │ + // ┌─▼──────▼─┐ │ + // │Loop Latch├─────┘ + // └────┬─────┘ + // ▼ + // Exit + if (!DT.dominates(BB, LoopLatch)) + continue; + + for (Instruction &Inst : *BB) { + if (isa(&Inst) || isa(&Inst)) + MemInsts.push_back(&Inst); + } + } +} + +/// Returns true if memory access instruction \p I may be overlapping under +/// recurrence expression \p Rec which comes from the pointer operand of \p I . +static bool memReadWriteMaybeOverlapping(Instruction *I, + const SCEVAddRecExpr *Rec, + ScalarEvolution *SE) { + assert(isa(I) || isa(I)); + const SCEV *AccessSize = SE->getElementSize(I); + const SCEV *Step = Rec->getStepRecurrence(*SE); + // The unknown situation is regarded as a possible overlap. + if (!AccessSize || !Step || !isa(AccessSize) || + !isa(Step)) + return true; + + ConstantInt *StepC = cast(Step)->getValue(); + if (StepC->isZero() || StepC->isNegative() || + StepC->getValue().getActiveBits() > 32) + return true; + + ConstantInt *AcSizeC = cast(AccessSize)->getValue(); + // When the accessing size is greater than the step size, then overlap occurs. + if (AcSizeC->getValue().getZExtValue() > StepC->getValue().getZExtValue()) + return true; + + // Notice that pointer may be wraps arround. Treat that as overlap. + if (!Rec->hasNoSelfWrap()) + return true; + + return false; +} + +/// Returns a SCEV representing the memory size of pointer \p V . +/// TODO: Memory size of more types can be identified here. +static const SCEV *getCertainSizeOfMem(const SCEV *V, Type *RTy, + const DataLayout &DL, + ScalarEvolution *SE) { + const SCEVUnknown *PtrBase = dyn_cast(V); + if (!PtrBase) + return nullptr; + + // TODO: Memory which has certain size. + AllocaInst *AllocateInst = dyn_cast(PtrBase->getValue()); + if (!AllocateInst) + return nullptr; + + // Make sure only handle normal array. + auto *Ty = dyn_cast(AllocateInst->getAllocatedType()); + auto *ArrSize = dyn_cast(AllocateInst->getArraySize()); + if (!Ty || !ArrSize || !ArrSize->isOne()) + return nullptr; + + return SE->getConstant(RTy, DL.getTypeAllocSize(Ty)); +} + +const SCEV * +ScalarEvolution::getConstantMaxTripCountFromMemAccess(const Loop *L) { + SmallVector MemInsts; + collectExeLoadStoreInsideLoop(L, DT, MemInsts); + + // Collect AddRecExpr that meets the requirements and can be analyzed. + SmallPtrSet Exprs; + for (Instruction *I : MemInsts) { + Value *Ptr = getLoadStorePointerOperand(I); + assert(Ptr && "getLoadStorePointerOperand changed."); + auto *AddRec = dyn_cast(getSCEV(Ptr)); + if (!AddRec || !AddRec->isAffine()) + continue; + + if (!memReadWriteMaybeOverlapping(I, AddRec, this)) + Exprs.insert(AddRec); + } + + const DataLayout &DL = getDataLayout(); + SmallVector InferCountColl; + for (auto *Rec : Exprs) { + const SCEV *PtrBase = getPointerBase(Rec); + const SCEV *Step = Rec->getStepRecurrence(*this); + const SCEV *MemSize = + getCertainSizeOfMem(PtrBase, Step->getType(), DL, this); + if (!MemSize) + continue; + + // Now we can infer a max execution time by MemLength/StepLength. + auto *MaxExeCount = dyn_cast(getUDivCeilSCEV(MemSize, Step)); + if (!MaxExeCount || MaxExeCount->getAPInt().getActiveBits() > 32) + continue; + + // If the loop reaches the maximum number of executions, we can not + // access bytes starting outside the statically allocated size without + // being immediate UB. But it is allowed to enter loop header one more + // time. + auto *InferCount = dyn_cast( + getAddExpr(MaxExeCount, getOne(MaxExeCount->getType()))); + // Discard the maximum number of execution times under 32bits. + if (!InferCount || InferCount->getAPInt().getActiveBits() > 32) + continue; + + InferCountColl.push_back(InferCount); + } + + if (InferCountColl.empty()) + return getCouldNotCompute(); + + return getUMinFromMismatchedTypes(InferCountColl); +} + unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) { auto *ExitCount = dyn_cast(getBackedgeTakenCount(L, Exact)); return getConstantTripCount(ExitCount); @@ -8105,7 +8265,17 @@ unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) { const auto *MaxExitCount = dyn_cast(getConstantMaxBackedgeTakenCount(L)); - return getConstantTripCount(MaxExitCount); + unsigned MaxExitCountN = getConstantTripCount(MaxExitCount); + if (UseMemoryAccessUBForBEInference) { + auto *MaxInferCount = getConstantMaxTripCountFromMemAccess(L); + if (auto *InferCount = dyn_cast(MaxInferCount)) { + unsigned InferValue = (unsigned)(InferCount->getValue()->getZExtValue()); + MaxExitCountN = (MaxExitCountN == 0) + ? InferValue + : std::min(MaxExitCountN, InferValue); + } + } + return MaxExitCountN; } unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) { @@ -13443,6 +13613,17 @@ OS << ": "; OS << "Trip multiple is " << SE->getSmallConstantTripMultiple(L) << "\n"; } + + if (UseMemoryAccessUBForBEInference) { + unsigned SmallMaxTrip = SE->getSmallConstantMaxTripCount(L); + OS << "Loop "; + L->getHeader()->printAsOperand(OS, /*PrintType=*/false); + OS << ": "; + if (SmallMaxTrip) + OS << "Small constant max trip is " << SmallMaxTrip << "\n"; + else + OS << "Small constant max trip couldn't be computed. " << "\n"; + } } namespace llvm { Index: llvm/test/Analysis/ScalarEvolution/infer-trip-count-idx-wrap.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/ScalarEvolution/infer-trip-count-idx-wrap.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -disable-output "-passes=print" -scalar-evolution-classify-expressions=0 -scalar-evolution-infer-max-trip-count-from-memory-access 2>&1 | FileCheck %s + +define void @ComputeMaxTripCountFromArrayIdxWrap(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromArrayIdxWrap' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromArrayIdxWrap +; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 2147483646 +; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Small constant max trip is 257 +; +entry: + %a = alloca [256 x i32], align 4 + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %iv = phi i8 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i8 %iv to i64 + %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %inc = add nuw nsw i8 %iv, 1 + %inc_zext = zext i8 %inc to i32 + %cmp = icmp slt i32 %inc_zext, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @ComputeMaxTripCountFromArrayIdxWrap2(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromArrayIdxWrap2' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromArrayIdxWrap2 +; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 2147483646 +; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Small constant max trip is 128 +; +entry: + %a = alloca [127 x i32], align 4 + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %iv = phi i8 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i8 %iv to i64 + %arrayidx = getelementptr inbounds [127 x i32], [127 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %inc = add nuw nsw i8 %iv, 1 + %inc_zext = zext i8 %inc to i32 + %cmp = icmp slt i32 %inc_zext, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @ComputeMaxTripCountFromArrayIdxWrap3(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromArrayIdxWrap3' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromArrayIdxWrap3 +; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 2147483646 +; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Small constant max trip is 21 +; +entry: + %a = alloca [20 x i32], align 4 + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %iv = phi i8 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i8 %iv to i64 + %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %inc = add nuw nsw i8 %iv, 1 + %inc_zext = zext i8 %inc to i32 + %cmp = icmp slt i32 %inc_zext, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} Index: llvm/test/Analysis/ScalarEvolution/infer-trip-count.ll =================================================================== --- /dev/null +++ llvm/test/Analysis/ScalarEvolution/infer-trip-count.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -disable-output "-passes=print" -scalar-evolution-classify-expressions=0 -scalar-evolution-infer-max-trip-count-from-memory-access 2>&1 | FileCheck %s + +define void @ComputeMaxTripCountFromArrayNormal(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromArrayNormal' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromArrayNormal +; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 2147483646 +; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Small constant max trip is 8 +; +entry: + %a = alloca [7 x i32], align 4 + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %iv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %iv to i64 + %arrayidx = getelementptr inbounds [7 x i32], [7 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %inc = add nuw nsw i32 %iv, 1 + %cmp = icmp slt i32 %inc, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + + +define void @ComputeMaxTripCountFromZeroArray(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromZeroArray' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromZeroArray +; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 2147483646 +; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Small constant max trip is 1 +; +entry: + %a = alloca [0 x i32], align 4 + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %iv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %iv to i64 + %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %inc = add nuw nsw i32 %iv, 1 + %cmp = icmp slt i32 %inc, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +define void @ComputeMaxTripCountFromExtremArray(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromExtremArray' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromExtremArray +; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is 2147483646 +; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + %len) +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Small constant max trip is 2147483647 +; +entry: + %a = alloca [4294967295 x i1], align 4 + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %iv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %iv to i64 + %arrayidx = getelementptr inbounds [4294967295 x i1], [4294967295 x i1]* %a, i64 0, i64 %idxprom + store i1 0, i1* %arrayidx, align 4 + %inc = add nuw nsw i32 %iv, 1 + %cmp = icmp slt i32 %inc, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + + +define void @ComputeMaxTripCountFromArrayInBranch(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromArrayInBranch' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromArrayInBranch +; CHECK-NEXT: Loop %for.cond: backedge-taken count is (0 smax %len) +; CHECK-NEXT: Loop %for.cond: constant max backedge-taken count is 2147483647 +; CHECK-NEXT: Loop %for.cond: symbolic max backedge-taken count is (0 smax %len) +; CHECK-NEXT: Loop %for.cond: Predicated backedge-taken count is (0 smax %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.cond: Trip multiple is 1 +; CHECK-NEXT: Loop %for.cond: Small constant max trip is 2147483648 +; +entry: + %a = alloca [8 x i32], align 4 + br label %for.cond + +for.cond: + %iv = phi i32 [ %inc, %for.inc ], [ 0, %entry ] + %cmp = icmp slt i32 %iv, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + br label %for.end + +for.body: + %cmp1 = icmp slt i32 %iv, 8 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %idxprom = sext i32 %iv to i64 + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + br label %if.end + +if.end: + br label %for.inc + +for.inc: + %inc = add nsw i32 %iv, 1 + br label %for.cond + +for.end: + ret void +} + +define void @ComputeMaxTripCountFromMultiDemArray(i32 signext %len) { +; CHECK-LABEL: 'ComputeMaxTripCountFromMultiDemArray' +; CHECK-NEXT: Determining loop execution counts for: @ComputeMaxTripCountFromMultiDemArray +; CHECK-NEXT: Loop %for.cond: backedge-taken count is (0 smax %len) +; CHECK-NEXT: Loop %for.cond: constant max backedge-taken count is 2147483647 +; CHECK-NEXT: Loop %for.cond: symbolic max backedge-taken count is (0 smax %len) +; CHECK-NEXT: Loop %for.cond: Predicated backedge-taken count is (0 smax %len) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.cond: Trip multiple is 1 +; CHECK-NEXT: Loop %for.cond: Small constant max trip is 2147483648 +; +entry: + %a = alloca [3 x [5 x i32]], align 4 + br label %for.cond + +for.cond: + %iv = phi i32 [ %inc, %for.inc ], [ 0, %entry ] + %cmp = icmp slt i32 %iv, %len + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + br label %for.end + +for.body: + %arrayidx = getelementptr inbounds [3 x [5 x i32]], [3 x [5 x i32]]* %a, i64 0, i64 3 + %idxprom = sext i32 %iv to i64 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %arrayidx, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx1, align 4 + br label %for.inc + +for.inc: + %inc = add nsw i32 %iv, 1 + br label %for.cond + +for.end: + ret void +}