Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -311,7 +311,7 @@ /// This struct holds information about the memory runtime legality check that /// a group of pointers do not overlap. struct RuntimePointerCheck { - RuntimePointerCheck() : Need(false) {} + RuntimePointerCheck(ScalarEvolution *SE) : Need(false), SE(SE) {} /// Reset the state of the pointer runtime information. void reset() { @@ -322,16 +322,41 @@ IsWritePtr.clear(); DependencySetId.clear(); AliasSetId.clear(); + Exprs.clear(); } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, + void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides); /// \brief No run-time memory checking is necessary. bool empty() const { return Pointers.empty(); } + /// A grouping of pointers. A single memcheck is required between two + /// groups. + struct CheckGroup { + CheckGroup() : High(0), Low(0) {} + // Index of the pointer which will represent the upper bound + // of the memcheck. + unsigned High; + // Index of the pointer which will represent the lower bound + // of the memcheck. + unsigned Low; + // Indices of all the pointers that constitute this grouping. + SmallVector<unsigned, 2> Members; + }; + + // Decide if we need to add a check between two groups of pointers, + // according to needsAnyChecking. + bool hasDependentMembers(struct CheckGroup &M, struct CheckGroup &N, + const SmallVectorImpl<int> *PtrPartition) const; + + // Groups pointers such that a single memcheck is required + // between two different groups. 
+ void groupChecks(SmallVectorImpl<CheckGroup> &GroupedChecks, + const SmallVectorImpl<int> *PtrPartition) const; + /// \brief Decide whether we need to issue a run-time check for pointer at /// index \p I and \p J to prove their independence. /// @@ -372,6 +397,10 @@ SmallVector<unsigned, 2> DependencySetId; /// Holds the id of the disjoint alias set to which this pointer belongs. SmallVector<unsigned, 2> AliasSetId; + /// Holds at position i the SCEV for the access i + SmallVector<const SCEV *, 2> Exprs; + /// Holds a pointer to the ScalarEvolution analysis. + ScalarEvolution *SE; }; LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -113,7 +113,7 @@ } void LoopAccessInfo::RuntimePointerCheck::insert( - ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, + Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides) { // Get the stride replaced scev. 
const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); @@ -127,6 +127,160 @@ IsWritePtr.push_back(WritePtr); DependencySetId.push_back(DepSetId); AliasSetId.push_back(ASId); + Exprs.push_back(Sc); +} + +static bool isInBoundsGep(Value *Ptr) { + // FIXME: we can expand this to include the following cases: + // - PHIs with only inbounds edges + // - Values used as the base of an inbounds GEP, where the + // GEP post-dominates this instruction + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) + return GEP->isInBounds(); + return false; +} + +bool LoopAccessInfo::RuntimePointerCheck::hasDependentMembers( + struct CheckGroup &M, struct CheckGroup &N, + const SmallVectorImpl<int> *PtrPartition) const { + for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I) + for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J) + if (needsChecking(M.Members[I], N.Members[J], PtrPartition)) + return true; + return false; +} + +void LoopAccessInfo::RuntimePointerCheck::groupChecks( + SmallVectorImpl<CheckGroup> &GroupedChecks, + const SmallVectorImpl<int> *PtrPartition) const { + + // Each pointer is part of an equivalence class. If two + // pointers are part of the same equivalence class, they will + // be part of the same group. + EquivalenceClasses<unsigned> CheckClass; + // For each pointer, holds the index to the first element + // that was added to the pointer's equivalence class. + SmallVector<unsigned, 2> SetLeaderIndex; + // Holds the index of the pointer which has the highest 'End' + // value. This will form the upper bound of the group's range. + // This is only valid for an index I if SetLeaderIndex[I] == I. + SmallVector<unsigned, 2> High; + // Same as the High vector. + SmallVector<unsigned, 2> Low; + // Same as High, except we keep the maximum difference from the + // leader. + SmallVector<const SCEVConstant *, 2> HighConstant; + // Same as HighConstant, except we keep the minimum. 
+ SmallVector<const SCEVConstant *, 2> LowConstant; + + for (unsigned I = 0; I < Pointers.size(); ++I) { + const SCEV *Sc = Exprs[I]; + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); + + // Create a new set containing only this element. + unsigned Leader = CheckClass.getOrInsertLeaderValue(I); + SetLeaderIndex.push_back(Leader); + High.push_back(Leader); + Low.push_back(Leader); + const SCEVConstant * Zero = + static_cast<const SCEVConstant *>(SE->getConstant(AR->getType(), 0)); + HighConstant.push_back(Zero); + LowConstant.push_back(Zero); + + // We don't want to reason about non-inbounds pointers. + if (!isInBoundsGep(Pointers[I])) continue; + + // If we don't need to add any check for this pointer + // then adding it to another pointer group wouldn't decrease + // the total number of memchecks that we have to do. + // Therefore it is better to leave it in its own group. + bool NeedsCheck = false; + for (unsigned J = 0; J < Pointers.size(); ++J) { + if (needsChecking(I, J, PtrPartition)) { + NeedsCheck = true; + break; + } + } + + if (!NeedsCheck) continue; + + // Go through all the existing sets and see if we can find one + // which can include this pointer. + for (EquivalenceClasses<unsigned>::iterator I = CheckClass.begin(), + E = CheckClass.end(); + I != E; ++I) { + if (!I->isLeader()) continue; + unsigned Value = I->getData(); + if (Leader == Value) continue; + Value = SetLeaderIndex[Value]; + // If the leader is inbounds, all the other members of an + // equivalence class will be inbounds as well. + if (!isInBoundsGep(Pointers[Value])) continue; + + const SCEV *Old = Exprs[Value]; + if (Old->getType() != AR->getType()) + continue; + const SCEV *Diff = SE->getMinusSCEV(AR, Old); + const SCEVConstant *C = dyn_cast<SCEVConstant>(Diff); + if (!C) continue; + + EquivalenceClasses<unsigned>::member_iterator AI, AE; + AI = CheckClass.member_begin(I), AE = CheckClass.member_end(); + + // Merging the check for Leader into this equivalence class means that + // we won't be able to memcheck the Leader's accesses against any other + // in this class. 
Therefore we need to first make sure that the excluded + // checks are not required. + bool Valid = true; + while (AI != AE) { + const unsigned Index = *AI; + if (needsChecking(Index, Leader, PtrPartition)) { + Valid = false; + break; + } + AI++; + } + if (!Valid) continue; + + // We're adding a new element to the set. Update the maximum + // and minimum differences. + if (C->getValue()->getValue(). + sgt(HighConstant[Value]->getValue()->getValue())) { + HighConstant[Value] = C; + High[Value] = Leader; + } + if (C->getValue()->getValue(). + slt(LowConstant[Value]->getValue()->getValue())) { + LowConstant[Value] = C; + Low[Value] = Leader; + } + + // Make the new element point to the leader of the set. + SetLeaderIndex[Leader] = SetLeaderIndex[Value]; + + CheckClass.unionSets(Leader, Value); + break; + } + } + + for (EquivalenceClasses<unsigned>::iterator I = CheckClass.begin(), + E = CheckClass.end(); + I != E; ++I) { + if (!I->isLeader()) continue; + unsigned Value = I->getData(); + Value = SetLeaderIndex[Value]; + // Add all members of this equivalence class to the memcheck + EquivalenceClasses<unsigned>::member_iterator AI, AE; + AI = CheckClass.member_begin(I), AE = CheckClass.member_end(); + + GroupedChecks.push_back(CheckGroup()); + GroupedChecks.back().High = High[Value]; + GroupedChecks.back().Low = Low[Value]; + while (AI != AE) { + GroupedChecks.back().Members.push_back(*AI); + ++AI; + } + } } bool LoopAccessInfo::RuntimePointerCheck::needsChecking( @@ -156,42 +310,83 @@ void LoopAccessInfo::RuntimePointerCheck::print( raw_ostream &OS, unsigned Depth, const SmallVectorImpl<int> *PtrPartition) const { - unsigned NumPointers = Pointers.size(); - if (NumPointers == 0) - return; OS.indent(Depth) << "Run-time memory checks:\n"; unsigned N = 0; - for (unsigned I = 0; I < NumPointers; ++I) - for (unsigned J = I + 1; J < NumPointers; ++J) - if (needsChecking(I, J, PtrPartition)) { - OS.indent(Depth) << N++ << ":\n"; - OS.indent(Depth + 2) << *Pointers[I]; - if (PtrPartition) - OS << " 
(Partition: " << (*PtrPartition)[I] << ")"; - OS << "\n"; - OS.indent(Depth + 2) << *Pointers[J]; - if (PtrPartition) - OS << " (Partition: " << (*PtrPartition)[J] << ")"; - OS << "\n"; + + SmallVector<CheckGroup, 4> GroupedChecks; + groupChecks(GroupedChecks, PtrPartition); + + unsigned NumGroups = GroupedChecks.size(); + if (NumGroups == 0) + return; + + for (unsigned I = 0; I < NumGroups; ++I) + for (unsigned J = I + 1; J < NumGroups; ++J) + if (hasDependentMembers(GroupedChecks[I], + GroupedChecks[J], + PtrPartition)) { + OS.indent(Depth) << "Check " << N++ << ":\n"; + + for (unsigned K = 0; K < GroupedChecks[I].Members.size(); ++K) { + OS.indent(Depth + 2) << *Pointers[GroupedChecks[I].Members[K]] + << "\n"; + if (PtrPartition) + OS << " (Partition: " + << (*PtrPartition)[GroupedChecks[I].Members[K]] << ")" + << "\n"; + } + + for (unsigned K = 0; K < GroupedChecks[J].Members.size(); ++K) { + OS.indent(Depth + 2) << *Pointers[GroupedChecks[J].Members[K]] + << "\n"; + if (PtrPartition) + OS << " (Partition: " + << (*PtrPartition)[GroupedChecks[J].Members[K]] << ")" + << "\n"; + } } + + OS.indent(Depth) << "Grouped accesses:\n"; + for (unsigned I = 0; I < NumGroups; ++I) { + OS.indent(Depth + 2) << "Group " << I << ":\n"; + OS.indent(Depth + 4) << "(Low: " << *Starts[GroupedChecks[I].Low] + << " High: " << *Ends[GroupedChecks[I].High] + << ")\n"; + for (unsigned J = 0; J < GroupedChecks[I].Members.size(); ++J) { + OS.indent(Depth + 6) << "Member: " << *Exprs[GroupedChecks[I].Members[J]] + << "\n"; + } + } } unsigned LoopAccessInfo::RuntimePointerCheck::getNumberOfChecks( const SmallVectorImpl<int> *PtrPartition) const { - unsigned NumPointers = Pointers.size(); + SmallVector<CheckGroup, 4> GroupedChecks; + + groupChecks(GroupedChecks, PtrPartition); + + unsigned NumPartitions = GroupedChecks.size(); unsigned CheckCount = 0; - for (unsigned I = 0; I < NumPointers; ++I) - for (unsigned J = I + 1; J < NumPointers; ++J) - if (needsChecking(I, J, PtrPartition)) + for (unsigned I = 0; I < NumPartitions; 
++I) + for (unsigned J = I + 1; J < NumPartitions; ++J) + if (hasDependentMembers(GroupedChecks[I], + GroupedChecks[J], + PtrPartition)) CheckCount++; return CheckCount; } bool LoopAccessInfo::RuntimePointerCheck::needsAnyChecking( const SmallVectorImpl<int> *PtrPartition) const { - return getNumberOfChecks(PtrPartition) != 0; + unsigned NumPointers = Pointers.size(); + + for (unsigned I = 0; I < NumPointers; ++I) + for (unsigned J = I + 1; J < NumPointers; ++J) + if (needsChecking(I, J, PtrPartition)) + return true; + return false; } namespace { @@ -341,7 +536,7 @@ // Each access has its own dependence set. DepId = RunningDepId++; - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); + RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); } else { @@ -498,12 +693,6 @@ } } -static bool isInBoundsGep(Value *Ptr) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) - return GEP->isInBounds(); - return false; -} - /// \brief Check whether the access through \p Ptr has a constant stride. 
int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap) { @@ -1312,7 +1501,6 @@ if (!PtrRtCheck.Need) return std::make_pair(nullptr, nullptr); - unsigned NumPointers = PtrRtCheck.Pointers.size(); SmallVector<TrackingVH<Value>, 2> Starts; SmallVector<TrackingVH<Value>, 2> Ends; @@ -1320,8 +1508,11 @@ SCEVExpander Exp(*SE, DL, "induction"); Instruction *FirstInst = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck.Pointers[i]; + SmallVector<LoopAccessInfo::RuntimePointerCheck::CheckGroup, 4> GroupedChecks; + PtrRtCheck.groupChecks(GroupedChecks, PtrPartition); + + for (unsigned i = 0; i < GroupedChecks.size(); ++i) { + Value *Ptr = PtrRtCheck.Pointers[GroupedChecks[i].Members[0]]; const SCEV *Sc = SE->getSCEV(Ptr); if (SE->isLoopInvariant(Sc, TheLoop)) { @@ -1330,14 +1521,20 @@ Starts.push_back(Ptr); Ends.push_back(Ptr); } else { - DEBUG(dbgs() << "LAA: Adding RT check for range:" << *Ptr << '\n'); unsigned AS = Ptr->getType()->getPointerAddressSpace(); // Use this type for pointer arithmetic. Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); + Value *Start = nullptr, *End = nullptr; - Value *Start = Exp.expandCodeFor(PtrRtCheck.Starts[i], PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck.Ends[i], PtrArithTy, Loc); + DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); + Start = Exp.expandCodeFor(PtrRtCheck.Starts[GroupedChecks[i].Low], + PtrArithTy, Loc); + End = Exp.expandCodeFor(PtrRtCheck.Ends[GroupedChecks[i].High], + PtrArithTy, Loc); + DEBUG(dbgs() << "Start: " << *PtrRtCheck.Starts[GroupedChecks[i].Low] + << " End: " << *PtrRtCheck.Ends[GroupedChecks[i].High] + << "\n"); Starts.push_back(Start); Ends.push_back(End); } @@ -1346,10 +1543,12 @@ IRBuilder<> ChkBuilder(Loc); // Our instructions might fold to a constant. 
Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - if (!PtrRtCheck.needsChecking(i, j, PtrPartition)) - continue; + for (unsigned i = 0; i < GroupedChecks.size(); ++i) { + for (unsigned j = i+1; j < GroupedChecks.size(); ++j) { + if (!PtrRtCheck.hasDependentMembers(GroupedChecks[i], + GroupedChecks[j], + PtrPartition)) + continue; unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); @@ -1399,7 +1598,7 @@ const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL), + : PtrRtCheck(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false), StoreToLoopInvariantAddress(false) { Index: test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll =================================================================== --- test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll +++ test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll @@ -1,19 +1,20 @@ -; RUN: opt -loop-accesses -analyze < %s | FileCheck %s - -; 3 reads and 3 writes should need 12 memchecks +; RUN: opt -O1 -loop-accesses -analyze < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" +; 3 reads and 3 writes should need 12 memchecks +; CHECK: function 'testf': ; CHECK: Memory dependences are safe with run-time checks + ; Memory dependecies have labels starting from 0, so in ; order to verify that we have n checks, we look for -; (n-1): and not n:. +; (n-1): and not n. 
; CHECK: Run-time memory checks: -; CHECK-NEXT: 0: -; CHECK: 11: -; CHECK-NOT: 12: +; CHECK-NEXT: Check 0: +; CHECK: Check 11: +; CHECK-NOT: Check 12: define void @testf(i16* %a, i16* %b, @@ -56,3 +57,116 @@ for.end: ; preds = %for.body ret void } + +; 3 reads and 2 writes - two of the reads can be merged, +; and the writes can be merged as well. This gives us a +; total of 2 memory checks. + +; CHECK: function 'testg': +; CHECK: Memory dependences are safe with run-time checks + +; CHECK: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK: Check 1: +; CHECK-NOT: Check 2: +; CHECK: Group 0: +; CHECK: Group 1: +; CHECK: Group 2: +; CHECK-NOT: Group 3: + +define void @testg(i16* %a, + i16* %b, + i16* %c) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + %store_ind = phi i64 [ 0, %entry ], [ %store_ind_next, %for.body ] + + %add = add nuw nsw i64 %ind, 1 + %store_ind_inc = add nuw nsw i64 %store_ind, 1 + %store_ind_next = add nuw nsw i64 %store_ind_inc, 1 + + %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %ind + %loadA = load i16, i16* %arrayidxA, align 2 + + %arrayidxA1 = getelementptr inbounds i16, i16* %a, i64 %add + %loadA1 = load i16, i16* %arrayidxA1, align 2 + + %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind + %loadB = load i16, i16* %arrayidxB, align 2 + + %mul = mul i16 %loadA, %loadA1 + %mul1 = mul i16 %mul, %loadB + + %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind + store i16 %mul1, i16* %arrayidxC, align 2 + + %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc + store i16 %mul, i16* %arrayidxC1, align 2 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; 3 reads and 2 writes - the writes can be merged into a single +; group, but the GEPs used for the reads are not marked as inbounds, +; and therefore cannot be merged. 
This will give us a total of 3 +; memory checks. + +; CHECK: function 'testh': +; CHECK: Memory dependences are safe with run-time checks + +; CHECK: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK: Check 1: +; CHECK: Check 2: +; CHECK-NOT: Check 3: +; CHECK: Group 0: +; CHECK: Group 1: +; CHECK: Group 2: +; CHECK: Group 3: +; CHECK-NOT: Group 4: + +define void @testh(i16* %a, + i16* %b, + i16* %c) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + %store_ind = phi i64 [ 0, %entry ], [ %store_ind_next, %for.body ] + + %add = add nuw nsw i64 %ind, 1 + %store_ind_inc = add nuw nsw i64 %store_ind, 1 + %store_ind_next = add nuw nsw i64 %store_ind_inc, 1 + + %arrayidxA = getelementptr i16, i16* %a, i64 %ind + %loadA = load i16, i16* %arrayidxA, align 2 + + %arrayidxA1 = getelementptr i16, i16* %a, i64 %add + %loadA1 = load i16, i16* %arrayidxA1, align 2 + + %arrayidxB = getelementptr i16, i16* %b, i64 %ind + %loadB = load i16, i16* %arrayidxB, align 2 + + %mul = mul i16 %loadA, %loadA1 + %mul1 = mul i16 %mul, %loadB + + %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind + store i16 %mul1, i16* %arrayidxC, align 2 + + %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc + store i16 %mul, i16* %arrayidxC1, align 2 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: test/Transforms/LoopDistribute/basic-with-memchecks.ll =================================================================== --- test/Transforms/LoopDistribute/basic-with-memchecks.ll +++ test/Transforms/LoopDistribute/basic-with-memchecks.ll @@ -32,8 +32,9 @@ %e = load i32*, i32** @E, align 8 br label %for.body -; We have two compares for each array overlap check which is a total of 10 -; compares. +; We have two compares for each array overlap check. 
+; Since the checks to A and A + 4 get merged, this will give us a +; total of 8 compares. ; ; CHECK: for.body.ldist.memcheck: ; CHECK: = icmp @@ -48,9 +49,6 @@ ; CHECK: = icmp ; CHECK: = icmp -; CHECK: = icmp -; CHECK: = icmp - ; CHECK-NOT: = icmp ; CHECK: br i1 %memcheck.conflict, label %for.body.ph.ldist.nondist, label %for.body.ph.ldist1