diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -406,8 +406,8 @@
   /// according to the assumptions that we've made during the analysis.
   /// The method might also version the pointer stride according to \p Strides,
   /// and add new predicates to \p PSE.
-  void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
-              unsigned ASId, const ValueToValueMap &Strides,
+  void insert(Loop *Lp, Value *Ptr, Type *AccessTy, bool WritePtr,
+              unsigned DepSetId, unsigned ASId, const ValueToValueMap &Strides,
               PredicatedScalarEvolution &PSE);
 
   /// No run-time memory checking is necessary.
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -189,8 +189,9 @@
 ///
 /// There is no conflict when the intervals are disjoint:
 /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
-                                    unsigned DepSetId, unsigned ASId,
+void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, Type *AccessTy,
+                                    bool WritePtr, unsigned DepSetId,
+                                    unsigned ASId,
                                     const ValueToValueMap &Strides,
                                     PredicatedScalarEvolution &PSE) {
   // Get the stride replaced scev.
@@ -227,8 +228,7 @@
   // Add the size of the pointed element to ScEnd.
   auto &DL = Lp->getHeader()->getModule()->getDataLayout();
   Type *IdxTy = DL.getIndexType(Ptr->getType());
-  const SCEV *EltSizeSCEV =
-      SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType());
+  const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
   ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
 
   Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
@@ -522,19 +522,19 @@
       : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), PSE(PSE) {}
 
   /// Register a load and whether it is only read from.
-  void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
+  void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
     Value *Ptr = const_cast<Value *>(Loc.Ptr);
     AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, false));
+    Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
     if (IsReadOnly)
       ReadOnlyPtr.insert(Ptr);
   }
 
   /// Register a store.
-  void addStore(MemoryLocation &Loc) {
+  void addStore(MemoryLocation &Loc, Type *AccessTy) {
     Value *Ptr = const_cast<Value *>(Loc.Ptr);
     AST.add(Ptr, LocationSize::beforeOrAfterPointer(), Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, true));
+    Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
   }
 
   /// Check if we can emit a run-time no-alias check for \p Access.
@@ -545,12 +545,11 @@
   /// we will attempt to use additional run-time checks in order to get
   /// the bounds of the pointer.
   bool createCheckForAccess(RuntimePointerChecking &RtCheck,
-                            MemAccessInfo Access,
+                            MemAccessInfo Access, Type *AccessTy,
                             const ValueToValueMap &Strides,
                             DenseMap<Value *, unsigned> &DepSetId,
                             Loop *TheLoop, unsigned &RunningDepId,
-                            unsigned ASId, bool ShouldCheckStride,
-                            bool Assume);
+                            unsigned ASId, bool ShouldCheckStride, bool Assume);
 
   /// Check whether we can check the pointers at runtime for
   /// non-intersection.
@@ -583,14 +582,15 @@
   MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
 
 private:
-  typedef SetVector<MemAccessInfo> PtrAccessSet;
+  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
 
   /// Go over all memory access and check whether runtime pointer checks
   /// are needed and build sets of dependency check candidates.
   void processMemAccesses();
 
-  /// Set of all accesses.
-  PtrAccessSet Accesses;
+  /// Map of all accesses. Values are the types used to access memory pointed to
+  /// by the pointer.
+  PtrAccessMap Accesses;
 
   /// The loop being checked.
   const Loop *TheLoop;
@@ -652,12 +652,12 @@
 
 /// Check whether a pointer address cannot wrap.
 static bool isNoWrap(PredicatedScalarEvolution &PSE,
-                     const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
+                     const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy,
+                     Loop *L) {
   const SCEV *PtrScev = PSE.getSCEV(Ptr);
   if (PSE.getSE()->isLoopInvariant(PtrScev, L))
     return true;
 
-  Type *AccessTy = Ptr->getType()->getPointerElementType();
   int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides);
   if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
     return true;
@@ -689,7 +689,7 @@
 }
 
 bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
-                                          MemAccessInfo Access,
+                                          MemAccessInfo Access, Type *AccessTy,
                                           const ValueToValueMap &StridesMap,
                                           DenseMap<Value *, unsigned> &DepSetId,
                                           Loop *TheLoop, unsigned &RunningDepId,
@@ -702,7 +702,7 @@
 
   // When we run after a failing dependency check we have to make sure
   // we don't have wrapping pointers.
-  if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
+  if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
     auto *Expr = PSE.getSCEV(Ptr);
     if (!Assume || !isa<SCEVAddRecExpr>(Expr))
       return false;
@@ -723,11 +723,11 @@
     DepId = RunningDepId++;
 
   bool IsWrite = Access.getInt();
-  RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
+  RtCheck.insert(TheLoop, Ptr, AccessTy, IsWrite, DepId, ASId, StridesMap, PSE);
   LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
 
   return true;
- }
+}
 
 bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                      ScalarEvolution *SE, Loop *TheLoop,
@@ -788,12 +788,15 @@
     }
 
     for (auto &Access : AccessInfos) {
-      if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
-                                RunningDepId, ASId, ShouldCheckWrap, false)) {
-        LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
-                          << *Access.getPointer() << '\n');
-        Retries.push_back(Access);
-        CanDoAliasSetRT = false;
+      for (auto &AccessTy : Accesses[Access]) {
+        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
+                                  DepSetId, TheLoop, RunningDepId, ASId,
+                                  ShouldCheckWrap, false)) {
+          LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
+                            << *Access.getPointer() << '\n');
+          Retries.push_back(Access);
+          CanDoAliasSetRT = false;
+        }
       }
     }
 
@@ -815,13 +818,16 @@
       // We know that we need these checks, so we can now be more aggressive
      // and add further checks if required (overflow checks).
       CanDoAliasSetRT = true;
-      for (auto Access : Retries)
-        if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId,
-                                  TheLoop, RunningDepId, ASId,
-                                  ShouldCheckWrap, /*Assume=*/true)) {
-          CanDoAliasSetRT = false;
-          break;
+      for (auto Access : Retries) {
+        for (auto &AccessTy : Accesses[Access]) {
+          if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
+                                    DepSetId, TheLoop, RunningDepId, ASId,
+                                    ShouldCheckWrap, /*Assume=*/true)) {
+            CanDoAliasSetRT = false;
+            break;
+          }
         }
+      }
     }
 
     CanDoRT &= CanDoAliasSetRT;
@@ -886,9 +892,12 @@
   LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
   LLVM_DEBUG({
     for (auto A : Accesses)
-      dbgs() << "\t" << *A.getPointer() << " (" <<
-                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
-                "read-only" : "read")) << ")\n";
+      dbgs() << "\t" << *A.first.getPointer() << " ("
+             << (A.first.getInt()
+                     ? "write"
+                     : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
+                                                                : "read"))
+             << ")\n";
   });
 
   // The AliasSetTracker has nicely partitioned our pointers by metadata
@@ -907,13 +916,13 @@
   UnderlyingObjToAccessMap ObjToLastAccess;
 
   // Set of access to check after all writes have been processed.
-  PtrAccessSet DeferredAccesses;
+  PtrAccessMap DeferredAccesses;
 
   // Iterate over each alias set twice, once to process read/write pointers,
   // and then to process read-only pointers.
   for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
     bool UseDeferred = SetIteration > 0;
-    PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+    PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
 
     for (const auto &AV : AS) {
       Value *Ptr = AV.getValue();
@@ -921,10 +930,10 @@
       // For a single memory access in AliasSetTracker, Accesses may contain
       // both read and write, and they both need to be handled for CheckDeps.
       for (const auto &AC : S) {
-        if (AC.getPointer() != Ptr)
+        if (AC.first.getPointer() != Ptr)
           continue;
 
-        bool IsWrite = AC.getInt();
+        bool IsWrite = AC.first.getInt();
 
         // If we're using the deferred access set, then it contains only
         // reads.
@@ -946,7 +955,9 @@
         // consecutive as "read-only" pointers (so that we check
         // "a[b[i]] +=").  Hence, we need the second check for "!IsWrite".
         if (!UseDeferred && IsReadOnlyPtr) {
-          DeferredAccesses.insert(Access);
+          // We only use the pointer keys; the types in the value sets don't
+          // matter here.
+          DeferredAccesses.insert({Access, {}});
           continue;
         }
 
@@ -1518,8 +1529,8 @@
   Value *BPtr = B.getPointer();
   bool AIsWrite = A.getInt();
   bool BIsWrite = B.getInt();
-  Type *ATy = APtr->getType()->getPointerElementType();
-  Type *BTy = BPtr->getType()->getPointerElementType();
+  Type *ATy = getLoadStoreType(InstMap[AIdx]);
+  Type *BTy = getLoadStoreType(InstMap[BIdx]);
 
   // Two reads are independent.
   if (!AIsWrite && !BIsWrite)
@@ -1842,8 +1853,6 @@
 void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
                                  const TargetLibraryInfo *TLI,
                                  DominatorTree *DT) {
-  typedef SmallPtrSet<Value *, 16> ValueSet;
-
   // Holds the Load and Store instructions.
   SmallVector<LoadInst *, 16> Loads;
   SmallVector<StoreInst *, 16> Stores;
@@ -1975,11 +1984,11 @@
   // for read and once for write, it will only appear once (on the write
   // list). This is okay, since we are going to check for conflicts between
   // writes and between reads and writes, but not between reads and reads.
-  ValueSet Seen;
+  SmallSet<std::pair<Value *, Type *>, 16> Seen;
 
   // Record uniform store addresses to identify if we have multiple stores
   // to the same address.
-  ValueSet UniformStores;
+  SmallPtrSet<Value *, 16> UniformStores;
 
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
@@ -1990,7 +1999,8 @@
 
     // If we did *not* see this pointer before, insert it to the read-write
     // list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr).second) {
+    Type *AccessTy = getLoadStoreType(ST);
+    if (Seen.insert({Ptr, AccessTy}).second) {
       ++NumReadWrites;
 
       MemoryLocation Loc = MemoryLocation::get(ST);
@@ -2001,9 +2011,9 @@
         Loc.AATags.TBAA = nullptr;
 
       visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
-                    [&Accesses, Loc](Value *Ptr) {
+                    [&Accesses, AccessTy, Loc](Value *Ptr) {
                       MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
-                      Accesses.addStore(NewLoc);
+                      Accesses.addStore(NewLoc, AccessTy);
                     });
     }
   }
@@ -2027,7 +2037,8 @@
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
     bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr).second ||
+    Type *AccessTy = getLoadStoreType(LD);
+    if (Seen.insert({Ptr, AccessTy}).second ||
         !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides)) {
       ++NumReads;
       IsReadOnlyPtr = true;
@@ -2049,9 +2060,9 @@
       Loc.AATags.TBAA = nullptr;
 
     visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
-                  [&Accesses, Loc, IsReadOnlyPtr](Value *Ptr) {
+                  [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
                     MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
-                    Accesses.addLoad(NewLoc, IsReadOnlyPtr);
+                    Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
                   });
   }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types_opaque_ptr.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types_opaque_ptr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types_opaque_ptr.ll
@@ -0,0 +1,179 @@
+; RUN: opt -S -disable-output --opaque-pointers -passes='print-access-info' < %s 2>&1 | FileCheck %s
+
+; In the function below some of the accesses are done as float types and some
+; are done as i32 types. When doing dependence analysis, the type should not
+; matter if it can be determined that they are the same size.
+
+%int_pair = type { i32, i32 }
+
+; CHECK-LABEL: function 'backdep_type_size_equivalence':
+; CHECK-NEXT:  loop:
+; CHECK-NEXT:    Memory dependences are safe with a maximum dependence distance of 800 bytes
+; CHECK-NEXT:    Dependences:
+; CHECK-NEXT:      Forward:
+; CHECK-NEXT:          %ld.f32 = load float, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:          store i32 %indvars.iv.i32, ptr %gep.iv, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:      Forward:
+; CHECK-NEXT:          %ld.f32 = load float, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:          store float %val, ptr %gep.iv.min.100, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:      BackwardVectorizable:
+; CHECK-NEXT:          store float %val, ptr %gep.iv.min.100, align 8 ->
+; CHECK-NEXT:          store i32 %indvars.iv.i32, ptr %gep.iv, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:    Run-time memory checks:
+; CHECK-NEXT:    Grouped accesses:
+
+define void @backdep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+entry:
+  br label %loop
+
+loop:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
+
+  ;; Load from vec[indvars.iv].x as float.
+  %gep.iv = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv, i32 0
+  %ld.f32 = load float, ptr %gep.iv, align 8
+  %val = fmul fast float %ld.f32, 5.0
+
+  ;; Store to vec[indvars.iv - 100].x as float.
+  %indvars.iv.min.100 = add nsw i64 %indvars.iv, -100
+  %gep.iv.min.100 = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv.min.100, i32 0
+  store float %val, ptr %gep.iv.min.100, align 8
+
+  ;; Store to vec[indvars.iv].x as i32, creating a backward dependency between
+  ;; the two stores with different element types but the same element size.
+  %indvars.iv.i32 = trunc i64 %indvars.iv to i32
+  store i32 %indvars.iv.i32, ptr %gep.iv, align 8
+
+  ;; Store to vec[indvars.iv].y as i32; these strided accesses should be
+  ;; independent between the two stores with different element types but the
+  ;; same element size.
+  %gep.iv.1 = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv, i32 1
+  store i32 %indvars.iv.i32, ptr %gep.iv.1, align 8
+
+  ;; Store to vec[indvars.iv + n].y as i32, to verify that there is no
+  ;; dependence in the case of an unknown dependence distance.
+  %indvars.iv.n = add nuw nsw i64 %indvars.iv, %n
+  %gep.iv.n = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv.n, i32 1
+  store i32 %indvars.iv.i32, ptr %gep.iv.n, align 8
+
+  ;; Loop condition.
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; In the function below one of the accesses is done as an i19 type, which has
+; a different store size than the i32 type, even though their alloc sizes are
+; equivalent. This is a negative test to ensure that the accesses are not
+; analyzed as in the tests above.
+;
+; CHECK-LABEL: function 'backdep_type_store_size_equivalence':
+; CHECK-NEXT:  loop:
+; CHECK-NEXT:    Report: unsafe dependent memory operations in loop.
+; CHECK-NEXT:    Unknown data dependence.
+; CHECK-NEXT:    Dependences:
+; CHECK-NEXT:      Unknown:
+; CHECK-NEXT:          %ld.f32 = load float, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:          store i19 %indvars.iv.i19, ptr %gep.iv, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:    Run-time memory checks:
+; CHECK-NEXT:    Grouped accesses:
+
+define void @backdep_type_store_size_equivalence(ptr nocapture %vec, i64 %n) {
+entry:
+  br label %loop
+
+loop:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
+
+  ;; Load from vec[indvars.iv].x as float.
+  %gep.iv = getelementptr inbounds %int_pair, ptr %vec, i64 %indvars.iv, i32 0
+  %ld.f32 = load float, ptr %gep.iv, align 8
+  %val = fmul fast float %ld.f32, 5.0
+
+  ;; Store to vec[indvars.iv].x as i19.
+  %indvars.iv.i19 = trunc i64 %indvars.iv to i19
+  store i19 %indvars.iv.i19, ptr %gep.iv, align 8
+
+  ;; Loop condition.
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; In the function below some of the accesses are done as double types and some
+; are done as i64 and i32 types. This is a negative test to ensure that they
+; are not analyzed as in the tests above.
+
+; CHECK-LABEL: function 'neg_dist_dep_type_size_equivalence':
+; CHECK-NEXT:  loop:
+; CHECK-NEXT:    Report: unsafe dependent memory operations in loop.
+; CHECK-NEXT:    Unknown data dependence.
+; CHECK-NEXT:    Dependences:
+; CHECK-NEXT:      Unknown:
+; CHECK-NEXT:          %ld.f64 = load double, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:          store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:      Unknown:
+; CHECK-NEXT:          %ld.i64 = load i64, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:          store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:      BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT:          %ld.f64 = load double, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:          store double %val, ptr %gep.iv.101, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:      ForwardButPreventsForwarding:
+; CHECK-NEXT:          store double %val, ptr %gep.iv.101, align 8 ->
+; CHECK-NEXT:          %ld.i64 = load i64, ptr %gep.iv, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:      Unknown:
+; CHECK-NEXT:          store double %val, ptr %gep.iv.101, align 8 ->
+; CHECK-NEXT:          store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:    Run-time memory checks:
+; CHECK-NEXT:    Grouped accesses:
+
+define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+entry:
+  br label %loop
+
+loop:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
+
+  ;; Load from vec[indvars.iv] as double.
+  %gep.iv = getelementptr i64, ptr %vec, i64 %indvars.iv
+  %ld.f64 = load double, ptr %gep.iv, align 8
+  %val = fmul fast double %ld.f64, 5.0
+
+  ;; Store to vec[indvars.iv + 101] as double.
+  %indvars.iv.101 = add nsw i64 %indvars.iv, 101
+  %gep.iv.101 = getelementptr i64, ptr %vec, i64 %indvars.iv.101
+  store double %val, ptr %gep.iv.101, align 8
+
+  ;; Load from vec[indvars.iv] as i64, creating a ForwardButPreventsForwarding
+  ;; dependence between accesses with different types but the same size.
+  %ld.i64 = load i64, ptr %gep.iv, align 8
+
+  ;; Store to vec[indvars.iv + n] as i32, i.e. with a different access size.
+  %indvars.iv.n = add nuw nsw i64 %indvars.iv, %n
+  %gep.iv.n = getelementptr inbounds i64, ptr %vec, i64 %indvars.iv.n
+  %ld.i64.i32 = trunc i64 %ld.i64 to i32
+  store i32 %ld.i64.i32, ptr %gep.iv.n, align 8
+
+  ;; Loop condition.
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
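
Note on the shape of the change (illustration only, none of the code below is part of the patch): with opaque pointers the accessed type can no longer be derived from the pointer operand's type, and one pointer may legitimately be accessed with several different types, so the analysis carries the type from each load/store and keeps a set of types per (pointer, is-write) access. The following is a minimal, self-contained C++ sketch of that bookkeeping using LLVM's ADTs; the free-standing helper collectAccess and the two type aliases are hypothetical stand-ins for the logic the patch puts in AccessAnalysis::addLoad/addStore.

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Same shape as LAA's MemAccessInfo: the pointer plus an is-write bit.
using AccessKey = PointerIntPair<Value *, 1, bool>;
// Insertion-ordered map from each access to every type used to access it.
using AccessMap = MapVector<AccessKey, SmallSetVector<Type *, 1>>;

static void collectAccess(AccessMap &Accesses, Instruction *I) {
  // getLoadStorePointerOperand returns the pointer operand for loads and
  // stores, and null for anything else.
  Value *Ptr = getLoadStorePointerOperand(I);
  if (!Ptr)
    return;
  bool IsWrite = isa<StoreInst>(I);
  // getLoadStoreType reads the accessed type off the instruction itself,
  // which stays correct under opaque pointers. A second access through the
  // same pointer with a different type adds another entry to the same set
  // instead of being dropped.
  Accesses[AccessKey(Ptr, IsWrite)].insert(getLoadStoreType(I));
}

Each recorded type is what RuntimePointerChecking::insert ultimately receives as AccessTy, and SE->getStoreSizeOfExpr(IdxTy, AccessTy) then sizes the runtime-check interval by the access actually performed rather than by a pointee type that opaque pointers no longer provide.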