Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -328,13 +328,15 @@
 /// two groups.
 struct RuntimeCheckingPtrGroup {
   /// Create a new pointer checking group containing a single
-  /// pointer, with index \p Index in RtCheck.
-  RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck);
+  /// pointer, with index \p Index in RtCheck, using fork 0 or 1 as selected
+  /// by \p IsFork if this represents a forked pointer.
+  RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck,
+                          bool IsFork = false);
 
   RuntimeCheckingPtrGroup(unsigned Index, const SCEV *Start, const SCEV *End,
-                          unsigned AS)
+                          unsigned AS, bool IsFork = false)
       : High(End), Low(Start), AddressSpace(AS) {
-    Members.push_back(Index);
+    Members.push_back(std::make_pair(Index, IsFork ? 1 : 0));
   }
 
   /// Tries to add the pointer recorded in RtCheck at index
@@ -342,9 +344,12 @@
   /// to a checking group if we will still be able to get
   /// the upper and lower bounds of the check. Returns true in case
   /// of success, false otherwise.
-  bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck);
+  /// For forked pointers this will only add one fork at a time, as
+  /// selected by \p IsFork.
+  bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck,
+                  bool IsFork = false);
   bool addPointer(unsigned Index, const SCEV *Start, const SCEV *End,
-                  unsigned AS, ScalarEvolution &SE);
+                  unsigned AS, ScalarEvolution &SE, bool IsFork = false);
 
   /// The SCEV expression which represents the upper bound of all the
   /// pointers in this group.
@@ -352,8 +357,9 @@
   /// The SCEV expression which represents the lower bound of all the
   /// pointers in this group.
   const SCEV *Low;
-  /// Indices of all the pointers that constitute this grouping.
-  SmallVector<unsigned, 2> Members;
+  /// Indices of all the pointers that constitute this grouping, including
+  /// which fork if the pointer is forked.
+  SmallVector<std::pair<unsigned, unsigned>, 2> Members;
   /// Address space of the involved pointers.
   unsigned AddressSpace;
 };
@@ -363,6 +369,11 @@
                   const RuntimeCheckingPtrGroup *>
     RuntimePointerCheck;
 
+/// Type for a possible forked pointer, where a single pointer could be
+/// one of two different pointers determined conditionally via a select
+/// instruction.
+using ForkedPointer = std::pair<const SCEV *, const SCEV *>;
+
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
@@ -372,12 +383,12 @@
   struct PointerInfo {
     /// Holds the pointer value that we need to check.
     TrackingVH<Value> PointerValue;
-    /// Holds the smallest byte address accessed by the pointer throughout all
-    /// iterations of the loop.
-    const SCEV *Start;
-    /// Holds the largest byte address accessed by the pointer throughout all
-    /// iterations of the loop, plus 1.
-    const SCEV *End;
+    /// Holds the smallest byte address(es) accessed by the pointer throughout
+    /// all iterations of the loop.
+    const SCEV *Starts[2];
+    /// Holds the largest byte address(es) accessed by the pointer throughout
+    /// all iterations of the loop, plus 1.
+    const SCEV *Ends[2];
     /// Holds the information if this pointer is used for writing to memory.
     bool IsWritePtr;
     /// Holds the id of the set of pointers that could be dependent because of a
@@ -385,23 +396,39 @@
     unsigned DependencySetId;
     /// Holds the id of the disjoint alias set to which this pointer belongs.
     unsigned AliasSetId;
-    /// SCEV for the access.
-    const SCEV *Expr;
-
-    PointerInfo(Value *PointerValue, const SCEV *Start, const SCEV *End,
-                bool IsWritePtr, unsigned DependencySetId, unsigned AliasSetId,
-                const SCEV *Expr)
-        : PointerValue(PointerValue), Start(Start), End(End),
-          IsWritePtr(IsWritePtr), DependencySetId(DependencySetId),
-          AliasSetId(AliasSetId), Expr(Expr) {}
+    /// SCEV(s) for the access.
+    const SCEV *Exprs[2];
+    /// Determines whether this represents a forked pointer, which will have
+    /// two SCEV expressions to consider for runtime checking instead of one.
+    bool HasFork = false;
+
+    PointerInfo(Value *PointerValue, SmallVectorImpl<const SCEV *> &ScStarts,
+                SmallVectorImpl<const SCEV *> &ScEnds, bool IsWritePtr,
+                unsigned DependencySetId, unsigned AliasSetId,
+                SmallVectorImpl<const SCEV *> &ScExprs)
+        : PointerValue(PointerValue), IsWritePtr(IsWritePtr),
+          DependencySetId(DependencySetId), AliasSetId(AliasSetId) {
+      assert(ScExprs.size() <= 2 && "Too many SCEV expressions for pointer");
+      for (size_t i = 0; i < ScExprs.size(); ++i) {
+        Exprs[i] = ScExprs[i];
+        Starts[i] = ScStarts[i];
+        Ends[i] = ScEnds[i];
+      }
+
+      // If there are two SCEV expressions associated with this pointer,
+      // then we have a fork.
+      HasFork = (ScExprs.size() == 2);
+    }
   };
 
-  RuntimePointerChecking(ScalarEvolution *SE) : Need(false), SE(SE) {}
+  RuntimePointerChecking(ScalarEvolution *SE, bool AllowForkedPtrs)
+      : Need(false), AllowForkedPtrs(AllowForkedPtrs), SE(SE) {}
 
   /// Reset the state of the pointer runtime information.
   void reset() {
     Need = false;
     Pointers.clear();
+    ForkedPtrs.clear();
     Checks.clear();
   }
 
@@ -444,12 +471,28 @@
                    const SmallVectorImpl<RuntimePointerCheck> &Checks,
                    unsigned Depth = 0) const;
 
+  /// Returns true if the pointer can be decomposed into two separate pointers
+  /// through a select.
+  bool isForkedPtr(const Value *Ptr) const {
+    return ForkedPtrs.count(Ptr) != 0;
+  }
+
   /// This flag indicates if we need to add the runtime check.
   bool Need;
 
+  /// This flag indicates that we allow pointers that diverge over a select.
+  bool AllowForkedPtrs;
+
   /// Information about the pointers that may require checking.
  SmallVector<PointerInfo, 2> Pointers;
 
+  /// Mapping between a pointer Value used by memory operations and a pair
+  /// of SCEV expressions. This is used in cases where a 'select' instruction
+  /// is used to form part of an address, and we could form a single
+  /// SCEVAddRecExpr with either side but not with both, so we just calculate
+  /// and store the two expressions to determine whether checks are required.
+  DenseMap<const Value *, ForkedPointer> ForkedPtrs;
+
   /// Holds a partitioning of pointers into "check groups".
   SmallVector<RuntimeCheckingPtrGroup, 2> CheckingGroups;
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -129,6 +129,15 @@
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
 
+/// Enables the detection of forked pointers when analyzing loops; these
+/// pointers could have two possible values at runtime based on a conditional
+/// select instruction, and we can analyze both possibilities to determine
+/// bounds.
+static cl::opt<bool> EnableForkedPointers(
+    "enable-forked-pointer-detection", cl::Hidden,
+    cl::desc("Enable detection of pointers forked by a select instruction"),
+    cl::init(false));
+
 bool VectorizerParams::isInterleaveForced() {
   return ::VectorizationInterleave.getNumOccurrences() > 0;
 }
@@ -167,13 +176,174 @@
   return Expr;
 }
 
+// Walk back through the IR for a pointer, looking for a select like the
+// following:
+//
+//  %offset = select i1 %cmp, i64 %a, i64 %b
+//  %addr = getelementptr double, double* %base, i64 %offset
+//  %ld = load double, double* %addr, align 8
+//
+// We won't be able to form a single SCEVAddRecExpr from this since the
+// address for each loop iteration depends on %cmp. We could potentially
+// produce multiple valid SCEVAddRecExprs, though, and check all of them for
+// memory safety/aliasing if needed.
+//
+// If we encounter some IR we don't yet handle, or something obviously fine
+// like a constant, then we just add the SCEV for that term to the list passed
+// in by the caller. If we have a node that may potentially yield a valid
+// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
+// ourselves before adding to the list.
+static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr,
+                            SmallVectorImpl<const SCEV *> &ScevList) {
+  const SCEV *Scev = SE->getSCEV(Ptr);
+  if (SE->isLoopInvariant(Scev, L) || isa<SCEVAddRecExpr>(Scev) ||
+      !isa<Instruction>(Ptr)) {
+    ScevList.push_back(Scev);
+    return;
+  }
+
+  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
+    switch (Opcode) {
+    case Instruction::Add:
+      return SE->getAddExpr(L, R);
+    case Instruction::Sub:
+      return SE->getMinusSCEV(L, R);
+    case Instruction::Mul:
+      return SE->getMulExpr(L, R);
+    default:
+      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
+    }
+  };
+
+  Instruction *I = cast<Instruction>(Ptr);
+  unsigned Opcode = I->getOpcode();
+  switch (Opcode) {
+  case Instruction::BitCast:
+    findForkedSCEVs(SE, L, I->getOperand(0), ScevList);
+    break;
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    SmallVector<const SCEV *, 2> ExtScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), ExtScevs);
+    for (const SCEV *Scev : ExtScevs)
+      if (Opcode == Instruction::SExt)
+        ScevList.push_back(SE->getSignExtendExpr(Scev, I->getType()));
+      else
+        ScevList.push_back(SE->getZeroExtendExpr(Scev, I->getType()));
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    Type *SourceTy = GEP->getSourceElementType();
+    // We only handle base + single offset GEPs here for now.
+    // Not dealing with preexisting gathers yet, so no vectors.
+    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
+      ScevList.push_back(Scev);
+      break;
+    }
+    SmallVector<const SCEV *, 2> BaseScevs;
+    SmallVector<const SCEV *, 2> OffsetScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs);
+    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs);
+
+    // Make sure we get the correct pointer type to extend to, including the
+    // address space.
+    const SCEV *BaseExpr = SE->getSCEV(GEP->getPointerOperand());
+    Type *IntPtrTy = SE->getEffectiveSCEVType(BaseExpr->getType());
+    SCEV::NoWrapFlags Wrap =
+        GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+    // Find the size of the type being pointed to. We only have a single
+    // index term (guarded above) so we don't need to index into arrays or
+    // structures, just get the size of the scalar value.
+    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
+
+    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1) {
+      const SCEV *Off1 = SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy);
+      const SCEV *Off2 = SE->getTruncateOrSignExtend(OffsetScevs[1], IntPtrTy);
+      const SCEV *Mul1 = SE->getMulExpr(Size, Off1, Wrap);
+      const SCEV *Mul2 = SE->getMulExpr(Size, Off2, Wrap);
+      const SCEV *Add1 = SE->getAddExpr(BaseScevs[0], Mul1, Wrap);
+      const SCEV *Add2 = SE->getAddExpr(BaseScevs[0], Mul2, Wrap);
+      ScevList.push_back(Add1);
+      ScevList.push_back(Add2);
+    } else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1) {
+      const SCEV *Off = SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy);
+      const SCEV *Mul = SE->getMulExpr(Size, Off, Wrap);
+      const SCEV *Add1 = SE->getAddExpr(BaseScevs[0], Mul, Wrap);
+      const SCEV *Add2 = SE->getAddExpr(BaseScevs[1], Mul, Wrap);
+      ScevList.push_back(Add1);
+      ScevList.push_back(Add2);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  case Instruction::Select: {
+    SmallVector<const SCEV *, 2> ChildScevs;
+    // A select means we've found a forked pointer, but we currently only
+    // support a single select per pointer so if there's another behind this
+    // then we just bail out and return the generic SCEV.
+    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs);
+    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs);
+    if (ChildScevs.size() == 2) {
+      ScevList.push_back(ChildScevs[0]);
+      ScevList.push_back(ChildScevs[1]);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  // If adding another binop to this list, update GetBinOpExpr above.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul: {
+    SmallVector<const SCEV *, 2> LScevs;
+    SmallVector<const SCEV *, 2> RScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), LScevs);
+    findForkedSCEVs(SE, L, I->getOperand(1), RScevs);
+    if (LScevs.size() == 2 && RScevs.size() == 1) {
+      const SCEV *Op1 = GetBinOpExpr(Opcode, LScevs[0], RScevs[0]);
+      const SCEV *Op2 = GetBinOpExpr(Opcode, LScevs[1], RScevs[0]);
+      ScevList.push_back(Op1);
+      ScevList.push_back(Op2);
+    } else if (LScevs.size() == 1 && RScevs.size() == 2) {
+      const SCEV *Op1 = GetBinOpExpr(Opcode, LScevs[0], RScevs[0]);
+      const SCEV *Op2 = GetBinOpExpr(Opcode, LScevs[0], RScevs[1]);
+      ScevList.push_back(Op1);
+      ScevList.push_back(Op2);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  default:
+    // Just return the current SCEV if we haven't handled the instruction yet.
+    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
+    ScevList.push_back(Scev);
+    break;
+  }
+
+  return;
+}
+
+static Optional<ForkedPointer> findForkedPointer(ScalarEvolution *SE, Value *V,
+                                                 const Loop *L) {
+  assert(SE->isSCEVable(V->getType()) && "Value is not SCEVable!");
+  SmallVector<const SCEV *, 2> Scevs;
+  findForkedSCEVs(SE, L, V, Scevs);
+
+  // For now, we will only accept a forked pointer with two options.
+  if (Scevs.size() == 2)
+    return std::make_pair(Scevs[0], Scevs[1]);
+
+  return None;
+}
+
 RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
-    unsigned Index, RuntimePointerChecking &RtCheck)
-    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
+    unsigned Index, RuntimePointerChecking &RtCheck, bool IsFork)
+    : High(RtCheck.Pointers[Index].Ends[IsFork ? 1 : 0]),
+      Low(RtCheck.Pointers[Index].Starts[IsFork ? 1 : 0]),
       AddressSpace(RtCheck.Pointers[Index]
                        .PointerValue->getType()
                        ->getPointerAddressSpace()) {
-  Members.push_back(Index);
+  Members.push_back(std::make_pair(Index, IsFork ? 1 : 0));
 }
 
 /// Calculate Start and End points of memory access.
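The GetElementPtr and binary-operator cases above share one merge rule: exactly one operand may be forked (two candidate SCEVs) while the other must be a single expression; anything else falls back to the plain SCEV for the whole node. A minimal standalone sketch of that rule, using strings in place of SCEVs (the combine helper and names are hypothetical, not part of the patch):

#include <cstdio>
#include <string>
#include <vector>

using ExprList = std::vector<std::string>;

// Combine a possibly-forked LHS with a possibly-forked RHS, mirroring the
// size()==2 / size()==1 cases in findForkedSCEVs above.
static ExprList combine(const ExprList &L, const ExprList &R,
                        const std::string &Op) {
  ExprList Out;
  if (L.size() == 2 && R.size() == 1) {
    Out.push_back("(" + L[0] + Op + R[0] + ")");
    Out.push_back("(" + L[1] + Op + R[0] + ")");
  } else if (L.size() == 1 && R.size() == 2) {
    Out.push_back("(" + L[0] + Op + R[0] + ")");
    Out.push_back("(" + L[0] + Op + R[1] + ")");
  } else {
    // Bail out: keep a single opaque expression for the whole node.
    Out.push_back("<unknown>");
  }
  return Out;
}

int main() {
  // A select over two bases with a shared loop-dependent offset yields two
  // candidate addresses, one per fork.
  ExprList Bases = {"%Base1", "%Base2"}; // forked by the select
  ExprList Offset = {"4*%iv"};           // common offset term
  for (const std::string &E : combine(Bases, Offset, " + "))
    std::printf("candidate address: %s\n", E.c_str());
}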
@@ -200,38 +370,51 @@ const SCEV *ScStart; const SCEV *ScEnd; - if (SE->isLoopInvariant(Sc, Lp)) { - ScStart = ScEnd = Sc; - } else { - const SCEVAddRecExpr *AR = dyn_cast(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = PSE.getBackedgeTakenCount(); - - ScStart = AR->getStart(); - ScEnd = AR->evaluateAtIteration(Ex, *SE); - const SCEV *Step = AR->getStepRecurrence(*SE); - - // For expressions with negative step, the upper bound is ScStart and the - // lower bound is ScEnd. - if (const auto *CStep = dyn_cast(Step)) { - if (CStep->getValue()->isNegative()) - std::swap(ScStart, ScEnd); + // See if this was a ForkedPtr. If so, we want to add checks for both sides. + SmallVector Scevs; + SmallVector Starts; + SmallVector Ends; + if (ForkedPtrs.count(Ptr)) { + Scevs.push_back(ForkedPtrs[Ptr].first); + Scevs.push_back(ForkedPtrs[Ptr].second); + } else + Scevs.push_back(Sc); + + for (const SCEV *Sc : Scevs) { + if (SE->isLoopInvariant(Sc, Lp)) { + ScStart = ScEnd = Sc; } else { - // Fallback case: the step is not constant, but we can still - // get the upper and lower bounds of the interval by using min/max - // expressions. - ScStart = SE->getUMinExpr(ScStart, ScEnd); - ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); + const SCEVAddRecExpr *AR = dyn_cast(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = PSE.getBackedgeTakenCount(); + + ScStart = AR->getStart(); + ScEnd = AR->evaluateAtIteration(Ex, *SE); + const SCEV *Step = AR->getStepRecurrence(*SE); + + // For expressions with negative step, the upper bound is ScStart and the + // lower bound is ScEnd. + if (const auto *CStep = dyn_cast(Step)) { + if (CStep->getValue()->isNegative()) + std::swap(ScStart, ScEnd); + } else { + // Fallback case: the step is not constant, but we can still + // get the upper and lower bounds of the interval by using min/max + // expressions. + ScStart = SE->getUMinExpr(ScStart, ScEnd); + ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); + } } + // Add the size of the pointed element to ScEnd. + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + Type *IdxTy = DL.getIndexType(Ptr->getType()); + const SCEV *EltSizeSCEV = + SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType()); + ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); + Starts.push_back(ScStart); + Ends.push_back(ScEnd); } - // Add the size of the pointed element to ScEnd. - auto &DL = Lp->getHeader()->getModule()->getDataLayout(); - Type *IdxTy = DL.getIndexType(Ptr->getType()); - const SCEV *EltSizeSCEV = - SE->getStoreSizeOfExpr(IdxTy, Ptr->getType()->getPointerElementType()); - ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); - - Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); + Pointers.emplace_back(Ptr, Starts, Ends, WritePtr, DepSetId, ASId, Scevs); } SmallVector @@ -261,7 +444,7 @@ const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I) for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J) - if (needsChecking(M.Members[I], N.Members[J])) + if (needsChecking(M.Members[I].first, N.Members[J].first)) return true; return false; } @@ -281,16 +464,19 @@ } bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, - RuntimePointerChecking &RtCheck) { + RuntimePointerChecking &RtCheck, + bool IsFork) { + unsigned Fork = IsFork ? 
1 : 0; return addPointer( - Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End, + Index, RtCheck.Pointers[Index].Starts[Fork], + RtCheck.Pointers[Index].Ends[Fork], RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(), - *RtCheck.SE); + *RtCheck.SE, Fork); } bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start, const SCEV *End, unsigned AS, - ScalarEvolution &SE) { + ScalarEvolution &SE, bool IsFork) { assert(AddressSpace == AS && "all pointers in a checking group must be in the same address space"); @@ -313,7 +499,7 @@ if (Min1 != End) High = End; - Members.push_back(Index); + Members.push_back(std::make_pair(Index, IsFork ? 1 : 0)); return true; } @@ -364,8 +550,14 @@ // for correctness, because in this case we can have checking between // pointers to the same underlying object. if (!UseDependencies) { - for (unsigned I = 0; I < Pointers.size(); ++I) - CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this)); + for (unsigned I = 0; I < Pointers.size(); ++I) { + auto Group = CheckingGroups.emplace_back(I, *this, /*Fork=*/0); + // Try to add a fork to the same group first, otherwise establish + // a new one. + if (Pointers[I].HasFork) + if (!Group.addPointer(I, *this, /*Fork=*/1)) + CheckingGroups.emplace_back(I, *this, /*Fork=*/1); + } return; } @@ -405,7 +597,9 @@ assert(PointerI != PositionMap.end() && "pointer in equivalence class not found in PositionMap"); unsigned Pointer = PointerI->second; - bool Merged = false; + bool Merged[2] = {false, false}; + // Two expressions to evaluate for forked pointers. + const unsigned NumExprs = Pointers[Pointer].HasFork ? 2 : 1; // Mark this pointer as seen. Seen.insert(Pointer); @@ -421,17 +615,27 @@ TotalComparisons++; - if (Group.addPointer(Pointer, *this)) { - Merged = true; + // Try to add both forks, if applicable. + for (unsigned J = 0; J < NumExprs; ++J) + if (!Merged[J]) + Merged[J] = Group.addPointer(Pointer, *this, J); + + if (Merged[0] && (Merged[1] || NumExprs == 1)) break; - } } - if (!Merged) - // We couldn't add this pointer to any existing set or the threshold - // for the number of comparisons has been reached. Create a new group - // to hold the current pointer. - Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this)); + // If we couldn't add the pointer expression(s) to any existing set or + // the threshold for the number of comparisons has been reached, create a + // new group to hold the current pointer expression(s). + RuntimeCheckingPtrGroup *Group = nullptr; + if (!Merged[0]) + Group = &Groups.emplace_back(Pointer, *this); + if (!Merged[1] && NumExprs == 2) { + // Try merging forks first, as they may have the same base address + // but different offsets. + if (!Group || !Group->addPointer(Pointer, *this, /*Fork=*/1)) + Groups.emplace_back(Pointer, *this, /*Fork=*/1); + } } // We've computed the grouped checks for this partition. 
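With the changes above, a checking-group member is no longer a bare pointer index but a (pointer index, fork index) pair, and groupChecks offers a forked pointer to existing groups once per fork before opening a new group for any fork that could not be merged. A small self-contained model of that bookkeeping (toy types, not LLVM code):

#include <cstdio>
#include <utility>
#include <vector>

struct ToyGroup {
  // (PointerIndex, ForkIndex) -- the fork index is 0 for ordinary pointers.
  std::vector<std::pair<unsigned, unsigned>> Members;
};

int main() {
  std::vector<ToyGroup> Groups;
  // Pointer 0 is not forked: one member, fork index 0.
  Groups.push_back({{{0, 0}}});
  // Pointer 1 is forked: assume fork 0 merges into the existing group while
  // fork 1 cannot, so fork 1 gets a group of its own.
  Groups[0].Members.push_back({1, 0});
  Groups.push_back({{{1, 1}}});
  for (unsigned G = 0; G < Groups.size(); ++G)
    for (auto &M : Groups[G].Members)
      std::printf("group %u: pointer %u, fork %u\n", G, M.first, M.second);
}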
@@ -476,12 +680,22 @@ OS.indent(Depth) << "Check " << N++ << ":\n"; OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n"; - for (unsigned K = 0; K < First.size(); ++K) - OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n"; + for (unsigned K = 0; K < First.size(); ++K) { + const PointerInfo &Info = Pointers[First[K].first]; + OS.indent(Depth + 2) << *Info.PointerValue << "\n"; + if (Info.HasFork) + OS.indent(Depth + 6) + << "Fork: " << *Info.Exprs[First[K].second] << "\n"; + } OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n"; - for (unsigned K = 0; K < Second.size(); ++K) - OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n"; + for (unsigned K = 0; K < Second.size(); ++K) { + const PointerInfo &Info = Pointers[Second[K].first]; + OS.indent(Depth + 2) << *Info.PointerValue << "\n"; + if (Info.HasFork) + OS.indent(Depth + 6) + << "Fork: " << *Info.Exprs[Second[K].second] << "\n"; + } } } @@ -498,7 +712,8 @@ OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High << ")\n"; for (unsigned J = 0; J < CG.Members.size(); ++J) { - OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr + const PointerInfo &Info = Pointers[CG.Members[J].first]; + OS.indent(Depth + 6) << "Member: " << *(Info.Exprs[CG.Members[J].second]) << "\n"; } } @@ -633,11 +848,13 @@ /// by adding run-time checks (overflow checks) if necessary. static bool hasComputableBounds(PredicatedScalarEvolution &PSE, const ValueToValueMap &Strides, Value *Ptr, - Loop *L, bool Assume) { + Loop *L, bool Assume, + RuntimePointerChecking &RtCheck) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); + ScalarEvolution *SE = PSE.getSE(); // The bounds for loop-invariant pointer is trivial. - if (PSE.getSE()->isLoopInvariant(PtrScev, L)) + if (SE->isLoopInvariant(PtrScev, L)) return true; const SCEVAddRecExpr *AR = dyn_cast(PtrScev); @@ -645,19 +862,97 @@ if (!AR && Assume) AR = PSE.getAsAddRec(Ptr); - if (!AR) - return false; + if (AR) + return AR->isAffine(); + + // If we can't find a single SCEVAddRecExpr, then maybe we can find two. + // The findForkedPointer function tries walking backwards through IR until + // it finds operands that are either loop invariant or for which a + // SCEVAddRecExpr can be formed. If a select instruction is encountered + // during this walk then we split the walk and try to generate two valid + // SCEVAddRecExprs. Both of those SCEVs can then be considered when + // deciding whether runtime checks are required. + if (RtCheck.AllowForkedPtrs) { + Optional FPtr = findForkedPointer(SE, Ptr, L); + if (!FPtr) + return false; + + const SCEV *A = FPtr->first; + const SCEV *B = FPtr->second; + LLVM_DEBUG(dbgs() << "LAA: ForkedPtr found: " << *Ptr << "\n"); + LLVM_DEBUG(dbgs() << "LAA: SCEV1: " << *A << "\n"); + LLVM_DEBUG(dbgs() << "LAA: SCEV2: " << *B << "\n"); + if (isa(A) && cast(A)->isAffine() && + isa(B) && cast(B)->isAffine()) { + RtCheck.ForkedPtrs[Ptr] = *FPtr; + return true; + } - return AR->isAffine(); + LLVM_DEBUG( + dbgs() << "LAA: Could not determine bounds for forked pointer\n"); + } + + return false; } /// Check whether a pointer address cannot wrap. 
static bool isNoWrap(PredicatedScalarEvolution &PSE, - const ValueToValueMap &Strides, Value *Ptr, Loop *L) { + const ValueToValueMap &Strides, Value *Ptr, Loop *L, + RuntimePointerChecking &RtCheck) { const SCEV *PtrScev = PSE.getSCEV(Ptr); if (PSE.getSE()->isLoopInvariant(PtrScev, L)) return true; + // The SCEV generated directly from a forked pointer is not an AddRecExpr, + // but an unknown SCEV. The PSE.hasNoOverflow method currently assumes + // that it must be an AddRecExpr and just casts. So we just bail out at + // this point, since we can't pass in the SCEVs one at a time -- + // hasNoOverflow takes a Value* as a param instead of a SCEV*. + // TODO: Support overflow checking for ForkedPointers. + // + // We can, however, calculate an effective stride for each side of the fork + // and check if the stride is 1 for both; this is the other way we can + // assume a pointer doesn't wrap. + if (RtCheck.isForkedPtr(Ptr)) { + auto SCEVs = RtCheck.ForkedPtrs[Ptr]; + const SCEVAddRecExpr *LSAR = cast(SCEVs.first); + const SCEVAddRecExpr *RSAR = cast(SCEVs.second); + const SCEV *LStep = LSAR->getStepRecurrence(*PSE.getSE()); + const SCEV *RStep = RSAR->getStepRecurrence(*PSE.getSE()); + + if (LStep != RStep) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr with mismatched steps\n"); + return false; + } + + auto *PtrTy = dyn_cast(Ptr->getType()); + auto &DL = L->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()).getFixedSize(); + + const SCEVConstant *C = dyn_cast(LStep); + if (!C) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr with non-constant steps\n"); + return false; + } + const APInt &APStepVal = C->getAPInt(); + if (APStepVal.getBitWidth() > 64) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr step is too large\n"); + return false; + } + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. + int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) { + LLVM_DEBUG(dbgs() << "LAA: Forkedptr step not multiple of elt size\n"); + return false; + } + + return Stride == 1; + } + Type *AccessTy = Ptr->getType()->getPointerElementType(); int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides); if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) @@ -698,12 +993,12 @@ bool Assume) { Value *Ptr = Access.getPointer(); - if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume)) + if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume, RtCheck)) return false; // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. 
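The forked-pointer path added to isNoWrap above, and exercised by the caller change below, reduces to a simple arithmetic rule: both forks must have the same constant step, the step must be an exact multiple of the element size, and the resulting stride must be 1. A standalone sketch of that rule under those assumptions (forksLookNonWrapping is a hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <optional>

// Returns true when two fork steps (nullopt = non-constant) match and give a
// unit stride for the given element size, mirroring the checks in isNoWrap.
static bool forksLookNonWrapping(std::optional<int64_t> StepA,
                                 std::optional<int64_t> StepB,
                                 int64_t EltSize) {
  if (!StepA || !StepB || *StepA != *StepB) // non-constant or mismatched steps
    return false;
  if (*StepA % EltSize != 0)                // step not a multiple of elt size
    return false;
  return *StepA / EltSize == 1;             // only unit stride is accepted
}

int main() {
  // Two float (4-byte) forks both stepping by 4 bytes per iteration: accepted.
  std::printf("%d\n", forksLookNonWrapping(4, 4, 4)); // 1
  // Mismatched steps (4 vs. 8 bytes): rejected.
  std::printf("%d\n", forksLookNonWrapping(4, 8, 4)); // 0
}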
- if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) { + if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop, RtCheck)) { auto *Expr = PSE.getSCEV(Ptr); if (!Assume || !isa(Expr)) return false; @@ -1862,6 +2157,7 @@ PtrRtChecking->Pointers.clear(); PtrRtChecking->Need = false; + PtrRtChecking->ForkedPtrs.clear(); const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); @@ -2245,7 +2541,8 @@ const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI) : PSE(std::make_unique(*SE, *L)), - PtrRtChecking(std::make_unique(SE)), + PtrRtChecking( + std::make_unique(SE, EnableForkedPointers)), DepChecker(std::make_unique(*PSE, L)), TheLoop(L), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false), HasConvergentOp(false), Index: llvm/lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -908,29 +908,30 @@ const RuntimePointerChecking *RtPtrChecking) { SmallVector Checks; - copy_if(AllChecks, std::back_inserter(Checks), - [&](const RuntimePointerCheck &Check) { - for (unsigned PtrIdx1 : Check.first->Members) - for (unsigned PtrIdx2 : Check.second->Members) - // Only include this check if there is a pair of pointers - // that require checking and the pointers fall into - // separate partitions. - // - // (Note that we already know at this point that the two - // pointer groups need checking but it doesn't follow - // that each pair of pointers within the two groups need - // checking as well. - // - // In other words we don't want to include a check just - // because there is a pair of pointers between the two - // pointer groups that require checks and a different - // pair whose pointers fall into different partitions.) - if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && - !RuntimePointerChecking::arePointersInSamePartition( - PtrToPartition, PtrIdx1, PtrIdx2)) - return true; - return false; - }); + copy_if( + AllChecks, std::back_inserter(Checks), + [&](const RuntimePointerCheck &Check) { + for (auto PtrIdx1 : Check.first->Members) + for (auto PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) 
+ if (RtPtrChecking->needsChecking(PtrIdx1.first, PtrIdx2.first) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1.first, PtrIdx2.first)) + return true; + return false; + }); return Checks; } Index: llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -393,8 +393,8 @@ [&](const RuntimePointerCheck &Check) { for (auto PtrIdx1 : Check.first->Members) for (auto PtrIdx2 : Check.second->Members) - if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath, - CandLoadPtrs)) + if (needsChecking(PtrIdx1.first, PtrIdx2.first, + PtrsWrittenOnFwdingPath, CandLoadPtrs)) return true; return false; }); Index: llvm/lib/Transforms/Utils/LoopVersioning.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -193,8 +193,9 @@ for (const auto &Group : RtPtrChecking->CheckingGroups) { GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain); - for (unsigned PtrIdx : Group.Members) - PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group; + for (auto PtrIdx : Group.Members) + PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx.first).PointerValue] = + &Group; } // Go through the checks and for each pointer group, collect the scopes for Index: llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll =================================================================== --- llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll +++ llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll @@ -1,19 +1,48 @@ -; RUN: opt -loop-accesses -analyze -enable-new-pm=0 %s 2>&1 | FileCheck %s -; RUN: opt -disable-output -passes='require,require,loop(print-access-info)' %s 2>&1 | FileCheck %s +; RUN: opt -enable-forked-pointer-detection -loop-accesses -analyze -enable-new-pm=0 %s 2>&1 | FileCheck %s +; RUN: opt -enable-forked-pointer-detection -disable-output -passes='require,require,loop(print-access-info)' %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv +; CHECK-NEXT: Fork: {%Base2,+,4}<%for.body> +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %indvars.iv +; CHECK-NEXT: Fork: {%Base1,+,4}<%for.body> +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group 
+; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base2 High: (400 + %Base2)) +; CHECK-NEXT: Member: {%Base2,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base1 High: (400 + %Base1)) +; CHECK-NEXT: Member: {%Base1,+,4}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_different_base_same_offset(float *A, float *B, float *C, int *D) { @@ -49,16 +78,39 @@ } ; CHECK-LABEL: function 'forked_ptrs_same_base_different_offset': -; CHECK-NEXT: for.body: -; CHECK-NEXT: Report: cannot identify array bounds -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv +; CHECK-NEXT: Against group +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: Fork: {(4 + %Base),+,4}<%for.body> +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %Base, i64 %idxprom213 +; CHECK-NEXT: Fork: {%Base,+,4}<%for.body> +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Dest High: (400 + %Dest)) +; CHECK-NEXT: Member: {%Dest,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Preds High: (400 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group +; CHECK-NEXT: (Low: %Base High: (404 + %Base)) +; CHECK-NEXT: Member: {(4 + %Base),+,4}<%for.body> +; CHECK-NEXT: Member: {%Base,+,4}<%for.body> ; CHECK-EMPTY: -; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. -; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: ; CHECK-EMPTY: -; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_same_base_different_offset(float *A, float *B, int *C) { Index: llvm/test/Transforms/LoopVectorize/forked-pointers.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/forked-pointers.ll +++ llvm/test/Transforms/LoopVectorize/forked-pointers.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -loop-vectorize -instcombine -force-vector-width=4 -S < %s 2>&1 | FileCheck %s +; RUN: opt -enable-forked-pointer-detection -loop-vectorize -instcombine -force-vector-width=4 -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -17,22 +17,84 @@ define dso_local void @forked_ptrs_different_base_same_offset(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) { ; CHECK-LABEL: @forked_ptrs_different_base_same_offset( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE2:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr float, float* [[BASE1:%.*]], i64 100 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE2]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ugt float* [[SCEVGEP10]], [[DEST]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[BASE2]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x float*> poison, float* [[BASE1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT19]], <4 x float*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 
[[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x float*> [[BROADCAST_SPLAT]], <4 x float*> [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float*> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float*> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float*> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float*> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP10]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP12]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP14]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP16]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP24]], <4 x float>* [[TMP26]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2:%.*]], float* [[BASE1:%.*]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP28]], 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2]], float* [[BASE1]] ; CHECK-NEXT: [[DOTSINK_IN:%.*]] = getelementptr inbounds float, 
float* [[SPEC_SELECT]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[DOTSINK:%.*]] = load float, float* [[DOTSINK_IN]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP29]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; entry: br label %for.body @@ -70,26 +132,83 @@ define dso_local void @forked_ptrs_same_base_different_offset(float* nocapture readonly %Base, float* nocapture %Dest, i32* nocapture readonly %Preds) { ; CHECK-LABEL: @forked_ptrs_same_base_different_offset( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PREDS:%.*]], i64 100 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[BASE:%.*]], i64 101 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP4]] to float* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[DEST]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[PREDS]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ugt float* [[SCEVGEP7]], [[DEST]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt float* [[SCEVGEP]], [[BASE]] +; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND13:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !12 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[VEC_IND13]], +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_IND15]] +; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; CHECK-NEXT: 
[[TMP13:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP9]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP11]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP13]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP15]], align 4, !alias.scope !15 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP23]], <4 x float>* [[TMP25]], align 4, !alias.scope !17, !noalias !19 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <4 x i32> [[VEC_IND13]], +; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL12]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP27]], 0 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_014]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[OFFSET_0:%.*]] = select i1 [[CMP1_NOT]], i32 [[ADD]], i32 [[TMP28]] ; CHECK-NEXT: [[IDXPROM213:%.*]] = zext i32 [[OFFSET_0]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE:%.*]], i64 [[IDXPROM213]] -; CHECK-NEXT: [[TMP2:%.*]] = 
load float, float* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[BASE]], i64 [[IDXPROM213]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[TMP29]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; entry: br label %for.body