diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -335,6 +335,11 @@
   /// Create a new pointer checking group containing a single
   /// pointer, with index \p Index in RtCheck.
   RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck);
+  RuntimeCheckingPtrGroup(unsigned Index, const SCEV *Start, const SCEV *End,
+                          unsigned AS, bool NeedsFreeze)
+      : High(End), Low(Start), AddressSpace(AS), NeedsFreeze(NeedsFreeze) {
+    Members.push_back(Index);
+  }

   /// Tries to add the pointer recorded in RtCheck at index
   /// \p Index to this pointer checking group. We can only add a pointer
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -54,6 +54,14 @@
 } // end namespace slpvectorizer

+struct SLPVectorizerResult {
+  bool MadeAnyChange;
+  bool MadeCFGChange;
+
+  SLPVectorizerResult(bool MadeAnyChange, bool MadeCFGChange)
+      : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {}
+};
+
 struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   using StoreList = SmallVector<StoreInst *, 8>;
   using StoreListMap = MapVector<Value *, StoreList>;
@@ -75,10 +83,12 @@
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);

   // Glue for old PM.
-  bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
-               TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_,
-               DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
-               OptimizationRemarkEmitter *ORE_);
+  SLPVectorizerResult runImpl(Function &F, ScalarEvolution *SE_,
+                              TargetTransformInfo *TTI_,
+                              TargetLibraryInfo *TLI_, AAResults *AA_,
+                              LoopInfo *LI_, DominatorTree *DT_,
+                              AssumptionCache *AC_, DemandedBits *DB_,
+                              OptimizationRemarkEmitter *ORE_);

 private:
   /// Collect store and getelementptr instructions and organize them
@@ -139,6 +149,11 @@
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);

+  SLPVectorizerResult
+  vectorizeBlockWithVersioning(BasicBlock *BB,
+                               const SmallPtrSetImpl<Value *> &TrackedObjects,
+                               slpvectorizer::BoUpSLP &R);
+
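The new `RuntimeCheckingPtrGroup` constructor above, together with the relaxed assert in LoopAccessAnalysis.cpp below, lets the SLP pass build a checking group directly from one pointer's SCEV bounds and then widen it per access. A simplified model of that merging, using plain integers instead of SCEVs (type and member names here are illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified model of what a checking group tracks: the union of the
// [Low, High) byte ranges of its member pointers, all in one address
// space. Plain integers stand in for the SCEV expressions LAA uses.
struct BoundsGroupModel {
  uint64_t Low;
  uint64_t High;
  unsigned AddressSpace;
  std::vector<unsigned> Members;

  // Counterpart of the new single-pointer constructor.
  BoundsGroupModel(unsigned Index, uint64_t Start, uint64_t End, unsigned AS)
      : Low(Start), High(End), AddressSpace(AS), Members{Index} {}

  // Counterpart of addPointer: widen the tracked range. The relaxed assert
  // in the patch corresponds to letting a group with no members yet adopt
  // the address space of the first pointer it receives.
  bool addPointer(unsigned Index, uint64_t Start, uint64_t End, unsigned AS) {
    if (!Members.empty() && AS != AddressSpace)
      return false; // pointers from different address spaces never merge
    AddressSpace = AS;
    Low = std::min(Low, Start);
    High = std::max(High, End);
    Members.push_back(Index);
    return true;
  }
};
```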
   /// The store instructions in a basic block organized by base pointer.
   StoreListMap Stores;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -388,7 +388,7 @@
                                          const SCEV *End, unsigned AS,
                                          bool NeedsFreeze,
                                          ScalarEvolution &SE) {
-  assert(AddressSpace == AS &&
+  assert((Members.empty() || AddressSpace == AS) &&
          "all pointers in a checking group must be in the same address space");

   // Compare the starts and ends with the known minimum and maximum
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -62,6 +63,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
@@ -85,9 +87,12 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
@@ -108,6 +113,10 @@
 #define DEBUG_TYPE "SLP"

 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+STATISTIC(NumVersioningSuccessful,
+          "Number of times versioning was tried and beneficial");
+STATISTIC(NumVersioningFailed,
+          "Number of times versioning was tried but was not beneficial");

 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                   cl::desc("Run the SLP vectorization passes"));
@@ -177,6 +186,10 @@
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));

+static cl::opt<bool> EnableMemoryVersioning(
+    "slp-memory-versioning", cl::init(false), cl::Hidden,
+    cl::desc("Enable memory versioning for SLP vectorization."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -833,6 +846,52 @@
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }

+namespace {
+/// Models a memory access to an underlying object with SCEV pointer expression
+/// and access type.
+struct AccessInfo {
+  Value *UnderlyingObj;
+  const SCEV *PtrSCEV;
+  Type *AccessTy;
+
+  AccessInfo(Value *UnderlyingObj = nullptr, const SCEV *PtrSCEV = nullptr,
+             Type *AccessTy = nullptr)
+      : UnderlyingObj(UnderlyingObj), PtrSCEV(PtrSCEV), AccessTy(AccessTy) {}
+
+  /// Returns the AccessInfo for \p I. If \p I isn't a memory instruction or the
+  /// pointer cannot be converted to a SCEV, return an empty object.
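The classification `AccessInfo::get` performs on each instruction boils down to the following stripped-down sketch (hypothetical helper name; the patch additionally filters the access type through `isValidElementType` and requires a known underlying object whose SCEV start dominates the block):

```cpp
#include "llvm/IR/Instructions.h"
#include <utility>
using namespace llvm;

// Only plain loads and stores are considered: the pointer operand plus the
// type that is actually read or written. Everything else yields nulls and
// is ignored for versioning purposes.
static std::pair<Value *, Type *> getPointerAndAccessType(Instruction *I) {
  if (auto *L = dyn_cast<LoadInst>(I))
    return {L->getPointerOperand(), L->getType()};
  if (auto *S = dyn_cast<StoreInst>(I))
    return {S->getPointerOperand(), S->getValueOperand()->getType()};
  return {nullptr, nullptr};
}
```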
+ static AccessInfo get(Instruction &I, ScalarEvolution &SE, + DominatorTree &DT) { + BasicBlock *BB = I.getParent(); + auto GetPtrAndAccessTy = [](Instruction *I) -> std::pair { + if (auto *L = dyn_cast(I)) { + if (isValidElementType(L->getType())) + return {L->getPointerOperand(), L->getType()}; + } + if (auto *S = dyn_cast(I)) + if (isValidElementType(S->getValueOperand()->getType())) + return {S->getPointerOperand(), S->getValueOperand()->getType()}; + return {nullptr, nullptr}; + }; + Value *Ptr; + Type *AccessTy; + std::tie(Ptr, AccessTy) = GetPtrAndAccessTy(&I); + if (!Ptr) + return {}; + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) + return {}; + auto *Start = SE.getSCEV(Ptr); + + PHINode *PN = dyn_cast(Obj); + if (!SE.properlyDominates(Start, BB) && + !(PN && DT.dominates(PN->getParent(), BB))) + return {}; + return {Obj, Start, AccessTy}; + } +}; +} // anonymous namespace + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -841,6 +900,18 @@ struct ScheduleData; public: + /// Set of objects we need to generate runtime checks for. + SmallPtrSet TrackedObjects; + + SmallSet, 8> DepObjs; + + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + using AliasCacheKey = std::pair; + DenseMap> AliasCache; + + bool CollectMemAccess = false; + using ValueList = SmallVector; using InstrList = SmallVector; using ValueSet = SmallPtrSet; @@ -960,6 +1031,17 @@ /// during analysis. void reorderBottomToTop(bool IgnoreReorder = false); + void removeDeletedInstructions() { + for (auto *I : DeletedInstructions) { + I->dropAllReferences(); + } + for (auto *I : DeletedInstructions) { + assert(I->use_empty() && "trying to erase instruction with users."); + I->eraseFromParent(); + } + DeletedInstructions.clear(); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -2647,12 +2729,6 @@ return aliased; } - using AliasCacheKey = std::pair; - - /// Cache for alias results. - /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap> AliasCache; - // Cache for pointerMayBeCaptured calls inside AA. This is preserved // globally through SLP because we don't perform any action which // invalidates capture results. @@ -3353,15 +3429,11 @@ } I->dropAllReferences(); } - for (auto *I : DeletedInstructions) { - assert(I->use_empty() && - "trying to erase instruction with users."); - I->eraseFromParent(); - } // Cleanup any dead scalar code feeding the vectorized instructions RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); + removeDeletedInstructions(); #ifdef EXPENSIVE_CHECKS // If we could guarantee that this call is not extremely slow, we could // remove the ifdef limitation (see PR47712). @@ -9510,9 +9582,37 @@ // balance between reduced runtime and accurate dependencies. numAliased++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + // If this bundle is not scheduled and no versioned code has been + // generated yet, try to collect the bounds of the accesses to + // generate runtime checks. 
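When the scheduler below hits a may-alias dependence it cannot disambiguate, it records the two underlying objects involved. Conceptually the bookkeeping is just an unordered-pair set; a minimal sketch of the canonicalization the following code performs (hypothetical helper, mirroring the swap in the patch):

```cpp
#include "llvm/ADT/SmallSet.h"
#include "llvm/IR/Value.h"
#include <utility>
using namespace llvm;

// Record one entry per unordered pair of underlying objects whose accesses
// may alias. Ordering the pair by pointer value before inserting makes
// {A, B} and {B, A} collapse to a single entry, so each conflicting pair
// yields exactly one runtime check later on.
static void recordDependentObjects(
    Value *SrcObj, Value *DstObj,
    SmallSet<std::pair<Value *, Value *>, 8> &DepObjs) {
  Value *A = SrcObj;
  Value *B = DstObj;
  if (A > B)
    std::swap(A, B);
  DepObjs.insert({A, B});
}
```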
+ if (!DestBundle->IsScheduled && SLP->CollectMemAccess) { + auto *Src = getLoadStorePointerOperand(SrcInst); + auto *Dst = getLoadStorePointerOperand(DepDest->Inst); + + if (SrcInst->getParent() == DepDest->Inst->getParent() && Src && + Dst) { + auto SrcObjAndPtr = AccessInfo::get(*SrcInst, *SLP->SE, *SLP->DT); + auto DstObjAndPtr = + AccessInfo::get(*DepDest->Inst, *SLP->SE, *SLP->DT); + if (!SrcObjAndPtr.UnderlyingObj || !DstObjAndPtr.UnderlyingObj || + SrcObjAndPtr.UnderlyingObj == DstObjAndPtr.UnderlyingObj) + SLP->TrackedObjects.clear(); + else { + SLP->TrackedObjects.insert(SrcObjAndPtr.UnderlyingObj); + SLP->TrackedObjects.insert(DstObjAndPtr.UnderlyingObj); + + Value *A = SrcObjAndPtr.UnderlyingObj; + Value *B = DstObjAndPtr.UnderlyingObj; + if (A > B) + std::swap(A, B); + SLP->DepObjs.insert({A, B}); + } + } + } + DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } @@ -9958,7 +10058,7 @@ auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE).MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -9974,9 +10074,11 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); AU.addPreserved(); - AU.setPreservesCFG(); + if (!EnableMemoryVersioning) { + AU.addPreserved(); + AU.setPreservesCFG(); + } } }; @@ -9993,23 +10095,374 @@ auto *DB = &AM.getResult(F); auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); - if (!Changed) + auto Result = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet(); + if (!Result.MadeCFGChange) + PA.preserveSet(); + PA.preserve(); + PA.preserve(); return PA; } -bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, - TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, - LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_, - OptimizationRemarkEmitter *ORE_) { +/// Restore the original CFG by removing \p VectorBB and folding \p CheckBB, \p +/// ScalarBB, \p MergeBB and \p Tail into a single block, like in the original +/// IR. +static void undoVersionedBlocks(BasicBlock *CheckBB, BasicBlock *ScalarBB, + DomTreeUpdater &DTU, LoopInfo *LI, + BasicBlock *VectorBB, StringRef OriginalBBName, + BasicBlock *MergeBB, BasicBlock *Tail) { + CheckBB->setName(OriginalBBName); + CheckBB->getTerminator()->eraseFromParent(); + ; + { + IRBuilder<> Builder(CheckBB); + Builder.CreateBr(ScalarBB); + } + DTU.applyUpdates({{DominatorTree::Delete, CheckBB, VectorBB}}); + LI->removeBlock(VectorBB); + VectorBB->getTerminator()->eraseFromParent(); + ; + { + IRBuilder<> Builder(VectorBB); + Builder.CreateUnreachable(); + } + DTU.applyUpdates({{DominatorTree::Delete, VectorBB, MergeBB}}); + DTU.deleteBB(VectorBB); + MergeBlockIntoPredecessor(MergeBB, &DTU, LI); + if (Tail) + MergeBlockIntoPredecessor(Tail, &DTU, LI); + MergeBlockIntoPredecessor(ScalarBB, &DTU, LI); + NumVersioningFailed++; +} + +SLPVectorizerResult SLPVectorizerPass::vectorizeBlockWithVersioning( + BasicBlock *BB, const SmallPtrSetImpl &TrackedObjects, + slpvectorizer::BoUpSLP &R) { + // Try to vectorize BB with versioning. 
+ // + // First, collect all memory bounds for accesses in the block. + // + // Next, split off the region between the first and last tracked memory + // access. + // + // Then, duplicate the split off region, one will remain scalar and one will + // be annotated with noalias metadata. + // + // Then introduce placeholder blocks for the memory runtime checks (branch to + // either scalar or versioned blocks) and a merge block joining the control + // flow from scalar and versioned blocks. + // + // Then, add noalias metadata for memory accessed in the versioned block and + // run SLP vectorization on the versioned block. + // + // Now compare the cost of the scalar block against the cost of the vector + // block + the cost of the runtime checks. If the vector cost is less than the + // scalar cost, generate runtime checks in the check block. Otherwise remove + // all temporary blocks and restore the original IR. + + bool Changed = false; + bool CFGChanged = false; + R.AliasCache.clear(); + + // First, clean up deleted instructions, so they are not re-used during SCEV + // expansion. + R.optimizeGatherSequence(); + R.removeDeletedInstructions(); + + auto &DL = BB->getModule()->getDataLayout(); + // Collect up-to-date memory bounds for tracked objects. Also collect the + // first and last memory instruction using a tracked object. + MapVector MemBounds; + SmallPtrSet WrittenObjs; + // First instruction that accesses an object we collect bounds for. + Instruction *FirstTrackedInst = nullptr; + // Last instruction that accesses an object we collect bounds for. + Instruction *LastTrackedInst = nullptr; + + DenseMap ObjOrder; + unsigned Order = 0; + for (Instruction &I : *BB) { + auto ObjAndStart = AccessInfo::get(I, *SE, *DT); + if (!ObjAndStart.UnderlyingObj) + continue; + auto *Obj = ObjAndStart.UnderlyingObj; + const auto *Start = ObjAndStart.PtrSCEV; + + if (I.mayWriteToMemory()) + WrittenObjs.insert(Obj); + + unsigned AS = Obj->getType()->getPointerAddressSpace(); + + // We know that the Start is dereferenced, hence adding one should not + // overflow: + Type *IdxTy = DL.getIndexType(Obj->getType()); + const SCEV *EltSizeSCEV = + SE->getStoreSizeOfExpr(IdxTy, ObjAndStart.AccessTy); + auto *End = SE->getAddExpr(Start, EltSizeSCEV); + + if (TrackedObjects.find(Obj) != TrackedObjects.end()) + MemBounds.insert({Obj, {0, Start, End, AS, false}}); + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + BoundsIter->second.addPointer(0, Start, End, AS, false, *SE); + + if (ObjOrder.find(Obj) == ObjOrder.end()) { + ObjOrder[Obj] = Order++; + } + if (!FirstTrackedInst) + FirstTrackedInst = &I; + LastTrackedInst = &I; + } + + // Not enough memory access bounds for runtime checks. + if (MemBounds.size() < 2 || WrittenObjs.empty()) + return {Changed, CFGChanged}; + + // Check if all uses between the first and last tracked instruction are inside + // the region. If that is not the case, PHIs would need to be added when + // duplicating the block. 
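For reference, the per-access range collected into `MemBounds` above reduces to [Start, Start + store-size-of(AccessTy)), expressed as SCEVs so that ranges from different instructions on the same object can be merged. A sketch using the same ScalarEvolution calls (hypothetical helper; assumes the access was already validated by `AccessInfo::get`):

```cpp
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include <utility>
using namespace llvm;

// Compute the byte range a single access touches, as SCEV expressions.
// Start is the pointer itself; End adds the store size of the accessed
// type, matching the bounds the versioned block is checked against.
static std::pair<const SCEV *, const SCEV *>
getAccessBounds(Value *Ptr, Type *AccessTy, ScalarEvolution &SE,
                const DataLayout &DL) {
  const SCEV *Start = SE.getSCEV(Ptr);
  Type *IdxTy = DL.getIndexType(Ptr->getType());
  const SCEV *EltSize = SE.getStoreSizeOfExpr(IdxTy, AccessTy);
  const SCEV *End = SE.getAddExpr(Start, EltSize);
  return {Start, End};
}
```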
+ auto AllUsesInside = [FirstTrackedInst, LastTrackedInst](BasicBlock *BB) { + return all_of(make_range(FirstTrackedInst->getIterator(), + std::next(LastTrackedInst->getIterator())), + [LastTrackedInst, BB](Instruction &I) { + return all_of(I.users(), [LastTrackedInst, BB](User *U) { + if (auto *UserI = dyn_cast(U)) + return UserI->getParent() == BB && + !isa(UserI) && + (UserI->comesBefore(LastTrackedInst) || + UserI == LastTrackedInst); + return true; + }); + }); + }; + if (!AllUsesInside(BB)) + return {Changed, CFGChanged}; + + SmallVector> BoundGroups; + for (auto &B : MemBounds) + BoundGroups.emplace_back(B.first, &B.second); + + // Create a RuntimePointerCheck for all groups in BoundGroups. + SmallVector PointerChecks; + uint64_t MaxDist = 0; + + for (auto &P : R.DepObjs) { + Value *SrcObj = P.first; + Value *SinkObj = P.second; + if (ObjOrder[SrcObj] > ObjOrder[SinkObj]) + std::swap(SrcObj, SinkObj); + + auto &SrcGroup = MemBounds.find(SrcObj)->second; + auto &SinkGroup = MemBounds.find(SinkObj)->second; + bool SrcWrites = WrittenObjs.contains(SrcObj); + bool SinkWrites = WrittenObjs.contains(SinkObj); + if (!SrcWrites && !SinkWrites) + continue; + const SCEV *CurDist = + SE->getUMaxExpr(SE->getMinusSCEV(SrcGroup.High, SrcGroup.Low), + SE->getMinusSCEV(SinkGroup.High, SinkGroup.Low)); + if (auto *C = dyn_cast(CurDist)) { + MaxDist = std::max(MaxDist, C->getValue()->getZExtValue()); + IntegerType *IntTy = IntegerType::get( + BB->getContext(), DL.getPointerSizeInBits(SinkGroup.AddressSpace)); + const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkGroup.Low, IntTy); + const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcGroup.Low, IntTy); + if (isa(SinkStartInt) || + isa(SrcStartInt)) { + return {Changed, CFGChanged}; + } + + PointerChecks.emplace_back(SinkStartInt, SrcStartInt, 1, false); + } else + return {Changed, CFGChanged}; + } + + // Duplicate BB now and set up block and branches for memory checks. 
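The pointer checks assembled above are "diff checks": at run time only the distance between the two start addresses is compared against the largest span either object covers in this block. Roughly, in illustrative C++ rather than the generated IR (parameter names are assumptions; `addDiffRuntimeChecks` emits the equivalent sub/icmp-ult sequence):

```cpp
#include <cstdint>

// What one emitted check computes (sketch): the scalar fallback is taken
// as soon as the unsigned distance between the two start addresses is
// smaller than MaxDist, the largest span either object covers in this
// block. The results of all checks are ORed together to form the branch
// condition in the check block.
static bool needsScalarFallback(uint64_t SinkStart, uint64_t SrcStart,
                                uint64_t MaxDist) {
  return SinkStart - SrcStart < MaxDist;
}
```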
+ std::string OriginalBBName = BB->getName().str(); + IRBuilder<> ChkBuilder(BB->getFirstNonPHI()); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + + BasicBlock *Tail = nullptr; + if (LastTrackedInst->getNextNode() != BB->getTerminator()) + Tail = SplitBlock(BB, LastTrackedInst->getNextNode(), &DTU, LI, nullptr, + OriginalBBName + ".tail"); + auto *CheckBB = BB; + BB = SplitBlock(BB, FirstTrackedInst, &DTU, LI, nullptr, + OriginalBBName + ".slpmemcheck"); + for (Use &U : make_early_inc_range(BB->uses())) { + BasicBlock *UserBB = cast(U.getUser())->getParent(); + if (UserBB == CheckBB) + continue; + + U.set(CheckBB); + DTU.applyUpdates({{DT->Delete, UserBB, BB}}); + DTU.applyUpdates({{DT->Insert, UserBB, CheckBB}}); + } + CFGChanged = true; + + auto *MergeBB = BB; + BasicBlock *ScalarBB = + splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr, + OriginalBBName + ".slpversioned"); + + ValueToValueMapTy VMap; + BasicBlock *VectorBB = CloneBasicBlock(ScalarBB, VMap, "", BB->getParent()); + ScalarBB->setName(OriginalBBName + ".scalar"); + MergeBB->setName(OriginalBBName + ".merge"); + SmallVector Tmp; + Tmp.push_back(VectorBB); + remapInstructionsInBlocks(Tmp, VMap); + auto *Term = CheckBB->getTerminator(); + ChkBuilder.SetInsertPoint(CheckBB->getTerminator()); + ChkBuilder.CreateCondBr(ChkBuilder.getTrue(), ScalarBB, VectorBB); + Term->eraseFromParent(); + DTU.applyUpdates({{DT->Insert, CheckBB, VectorBB}}); + if (auto *L = LI->getLoopFor(CheckBB)) + L->addBasicBlockToLoop(VectorBB, *LI); + Changed = true; + + // Add !noalias metadata to memory accesses in the versioned block. + LLVMContext &Ctx = BB->getContext(); + MDBuilder MDB(Ctx); + MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain"); + + DenseMap GroupToScope; + for (const auto &Group : MemBounds) + GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain); + + for (Instruction &I : *VectorBB) { + auto *Ptr = getLoadStorePointerOperand(&I); + if (!Ptr) + continue; + + auto *PtrSCEV = SE->getSCEV(Ptr); + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) { + if (auto *GEP = dyn_cast(Ptr)) + Obj = GEP->getOperand(0); + else + continue; + } + + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + auto *LowerBound = BoundsIter->second.Low; + auto *UpperBound = BoundsIter->second.High; + auto *Scope = GroupToScope.find(&BoundsIter->second)->second; + + auto *LowerSub = SE->getMinusSCEV(PtrSCEV, LowerBound); + auto *UpperSub = SE->getMinusSCEV(UpperBound, PtrSCEV); + if (!isa(LowerSub) && + !isa(UpperSub) && + SE->isKnownNonNegative(LowerSub) && SE->isKnownNonNegative(UpperSub)) { + I.setMetadata( + LLVMContext::MD_alias_scope, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(Ctx, Scope))); + + SmallVector NonAliasing; + for (auto &KV : GroupToScope) { + if (KV.first == &BoundsIter->second) + continue; + NonAliasing.push_back(KV.second); + } + I.setMetadata(LLVMContext::MD_noalias, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias), + MDNode::get(Ctx, NonAliasing))); + } + } + + DTU.flush(); + DT->updateDFSNumbers(); + collectSeedInstructions(VectorBB); + + // Vectorize trees that end at stores. 
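The metadata step above gives every bounds group its own anonymous alias scope in a shared domain; each access in the versioned block is then tagged as belonging to its group's scope and as not aliasing all other groups' scopes. The core of that tagging, as a hypothetical helper around the same MDNode APIs the patch uses:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Attach scoped-noalias info to a memory access in the versioned block:
// !alias.scope says "I access memory in OwnScope", !noalias says "I do not
// access memory in any of OtherScopes". Existing metadata is preserved by
// concatenating instead of overwriting.
static void addVersioningScopes(Instruction &I, MDNode *OwnScope,
                                ArrayRef<Metadata *> OtherScopes) {
  LLVMContext &Ctx = I.getContext();
  I.setMetadata(LLVMContext::MD_alias_scope,
                MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope),
                                    MDNode::get(Ctx, OwnScope)));
  I.setMetadata(LLVMContext::MD_noalias,
                MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias),
                                    MDNode::get(Ctx, OtherScopes)));
}
```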
+ assert(!Stores.empty() && "should have stores when versioning"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); + bool AnyVectorized = vectorizeStoreChains(R); + Changed |= AnyVectorized; + + InstructionCost SLPCost = 0; + InstructionCost ScalarCost = 0; + if (AnyVectorized) { + R.optimizeGatherSequence(); + R.removeDeletedInstructions(); + for (Instruction &I : *ScalarBB) + ScalarCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + for (Instruction &I : make_early_inc_range(reverse(*VectorBB))) { + if (isInstructionTriviallyDead(&I, TLI)) { + I.eraseFromParent(); + continue; + } + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + } + + // Estimate the size of the runtime checks, consisting of computing lower & + // upper bounds (2), the overlap checks (2) and the AND/OR to combine the + // checks. + SLPCost += 5 * PointerChecks.size() + MemBounds.size(); + } + + if (!AnyVectorized || SLPCost >= ScalarCost) { + // Vectorization not beneficial or possible. Restore original state by + // removing the introduced blocks. + R.getORE()->emit([&]() { + OptimizationRemarkMissed Rem(SV_NAME, "VersioningNotBeneficial", + &*ScalarBB->begin()); + Rem << "Tried to version block but was not beneficial"; + if (AnyVectorized) { + Rem << ore::NV("VectorCost", SLPCost) + << " >= " << ore::NV("ScalarCost", ScalarCost); + } else + Rem << "(nothing vectorized)"; + return Rem; + }); + Changed = false; + CFGChanged = false; + undoVersionedBlocks(CheckBB, ScalarBB, DTU, LI, VectorBB, OriginalBBName, + MergeBB, Tail); + } else { + R.getORE()->emit( + OptimizationRemark(SV_NAME, "VersioningSuccessful", &*ScalarBB->begin()) + << "SLP vectorization with versioning is beneficial " + << ore::NV("VectorCost", SLPCost) << " < " + << ore::NV("ScalarCost", ScalarCost) + << ore::NV("AnyVectorized", AnyVectorized)); + + ChkBuilder.SetInsertPoint(CheckBB->getTerminator()); + SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(), + "memcheck"); + Value *MemoryOverlap = addDiffRuntimeChecks( + CheckBB->getTerminator(), PointerChecks, Exp, + [MaxDist](IRBuilderBase &B, unsigned Bits) { + return B.getIntN(Bits, MaxDist); + }, + 1); + /* Value *MemoryOverlap =*/ + /*addRuntimeChecks(CheckBB->getTerminator(), nullptr, PointerChecks, Exp);*/ + assert(MemoryOverlap && + "runtime checks required, but no checks generated in IR?"); + cast(CheckBB->getTerminator())->setCondition(MemoryOverlap); + NumVersioningSuccessful++; + } + DTU.flush(); + DT->updateDFSNumbers(); + + return {Changed, CFGChanged}; +} + +SLPVectorizerResult SLPVectorizerPass::runImpl( + Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, + AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { if (!RunSLPVectorization) - return false; + return {false, false}; SE = SE_; TTI = TTI_; TLI = TLI_; @@ -10023,18 +10476,19 @@ Stores.clear(); GEPs.clear(); bool Changed = false; + bool CFGChanged = false; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { LLVM_DEBUG( dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"); - return false; + return {false, false}; } // Don't vectorize when the attribute NoImplicitFloat is used. 
if (F.hasFnAttribute(Attribute::NoImplicitFloat)) - return false; + return {false, false}; LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); @@ -10048,21 +10502,31 @@ // Update DFS numbers now so that we can use them for ordering. DT->updateDFSNumbers(); + SmallVector BlocksToRetry; + SmallVector, 4> BoundsToUse; // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { // Start new block - clear the list of reduction roots. R.clearReductionData(); collectSeedInstructions(BB); + bool VectorizedBlock = false; // Vectorize trees that end at stores. if (!Stores.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() << " underlying objects.\n"); - Changed |= vectorizeStoreChains(R); + R.TrackedObjects.clear(); + + if (EnableMemoryVersioning) + R.CollectMemAccess = BB->size() <= 300; + + VectorizedBlock = vectorizeStoreChains(R); + + R.CollectMemAccess = false; } // Vectorize trees that end at reductions. - Changed |= vectorizeChainsInBlock(BB, R); + VectorizedBlock |= vectorizeChainsInBlock(BB, R); // Vectorize the index computations of getelementptr instructions. This // is primarily intended to catch gather-like idioms ending at @@ -10070,15 +10534,30 @@ if (!GEPs.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() << " underlying objects.\n"); - Changed |= vectorizeGEPIndices(BB, R); + VectorizedBlock |= vectorizeGEPIndices(BB, R); } + + if (!VectorizedBlock && !R.TrackedObjects.empty()) { + BlocksToRetry.push_back(BB); + BoundsToUse.push_back(R.TrackedObjects); + } + R.TrackedObjects.clear(); + Changed |= VectorizedBlock; + } + + for (unsigned I = 0; I != BlocksToRetry.size(); I++) { + auto Status = + vectorizeBlockWithVersioning(BlocksToRetry[I], BoundsToUse[I], R); + Changed |= Status.MadeAnyChange; + CFGChanged |= Status.MadeCFGChange; } if (Changed) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); } - return Changed; + + return {Changed, CFGChanged}; } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -enable-new-pm=false < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -92,57 +92,83 @@ define void @f_alias(i8* nocapture %dst, i8* nocapture readonly %src, %struct.weight_t* nocapture readonly %w) { ; CHECK-LABEL: @f_alias( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST38:%.*]] = ptrtoint i8* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC37:%.*]] = ptrtoint i8* [[SRC:%.*]] to i64 ; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = 
zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[SRC37]], [[DST38]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[SRC]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ult i32 [[ADD]], 256 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD]], 0 -; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[ADD]], 0 +; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP4]] to i32 ; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]] ; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8 -; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1 +; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST]], align 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP0]], [[CONV_1]] ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I_1:%.*]] = icmp ult i32 [[ADD_1]], 256 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_1]], 0 -; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[ADD_1]], 0 +; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP6]] to i32 ; CHECK-NEXT: [[COND_I_1:%.*]] = select i1 [[TOBOOL_NOT_I_1]], i32 [[ADD_1]], i32 [[SHR_I_1]] ; CHECK-NEXT: [[CONV_I_1:%.*]] = trunc i32 [[COND_I_1]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1 ; CHECK-NEXT: store i8 [[CONV_I_1]], i8* [[ARRAYIDX2_1]], align 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP0]], [[CONV_2]] ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I_2:%.*]] = icmp ult i32 [[ADD_2]], 256 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_2]], 0 -; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[ADD_2]], 0 +; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP8]] to i32 ; CHECK-NEXT: [[COND_I_2:%.*]] = select i1 [[TOBOOL_NOT_I_2]], i32 [[ADD_2]], i32 [[SHR_I_2]] ; CHECK-NEXT: [[CONV_I_2:%.*]] = trunc i32 [[COND_I_2]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 ; CHECK-NEXT: store i8 [[CONV_I_2]], i8* [[ARRAYIDX2_2]], align 1 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP9]] to i32 ; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP0]], [[CONV_3]] ; CHECK-NEXT: 
[[ADD_3:%.*]] = add nsw i32 [[MUL_3]], [[TMP1]] ; CHECK-NEXT: [[TOBOOL_NOT_I_3:%.*]] = icmp ult i32 [[ADD_3]], 256 -; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_3]], 0 -; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[ADD_3]], 0 +; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP10]] to i32 ; CHECK-NEXT: [[COND_I_3:%.*]] = select i1 [[TOBOOL_NOT_I_3]], i32 [[ADD_3]], i32 [[SHR_I_3]] ; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], align 1 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[SHUFFLE36:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP15]], [[SHUFFLE36]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <4 x i32> [[TMP17]], +; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt <4 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP17]], <4 x i32> [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP21]] to <4 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[DST]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP22]], <4 x i8>* [[TMP23]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %scale = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll @@ -1,16 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s -; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s +; RUN: opt -scoped-noalias-aa -slp-vectorizer -slp-memory-versioning -enable-new-pm=false -mtriple=arm64-apple-ios -S %s | FileCheck %s +; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s define void @loop1(i32* %A, i32* %B, i64 %N) { ; CHECK-LABEL: @loop1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A29:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[B28:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], 
i64 [[IV]] +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[LOOP_TAIL:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_TAIL]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVAR]], 6 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[B28]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[A29]], [[TMP0]] +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: ; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B_GEP_0]], align 4 -; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]] +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] ; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A_GEP_0]], align 4 ; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[A_0]], 20 ; CHECK-NEXT: [[XOR_0:%.*]] = xor i32 [[ADD_0]], [[B_0]] @@ -39,11 +49,27 @@ ; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[A_3]], 20 ; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[ADD_3]], [[B_3]] ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: br label [[LOOP_MERGE:%.*]] +; CHECK: loop.merge: +; CHECK-NEXT: br label [[LOOP_TAIL]] +; CHECK: loop.tail: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 16 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[A_GEP_03:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B_GEP_0]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[LOOP_MERGE]] ; entry: br label %loop @@ -92,16 +118,22 @@ define void @loop_iv_update_at_start(float* %src, float* %dst) #0 { ; CHECK-LABEL: @loop_iv_update_at_start( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST27:%.*]] = ptrtoint float* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC26:%.*]] = ptrtoint float* [[SRC:%.*]] to i64 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_MERGE:%.*]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV]], 2000 -; CHECK-NEXT: [[SRC_GEP_0:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[SRC_GEP_0:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SRC26]], [[DST27]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 20 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: ; 
CHECK-NEXT: [[SRC_0:%.*]] = load float, float* [[SRC_GEP_0]], align 8 ; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[SRC_0]], 1.000000e+00 ; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], [[SRC_0]] -; CHECK-NEXT: [[DST_GEP_0:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 0 +; CHECK-NEXT: [[DST_GEP_0:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 ; CHECK-NEXT: store float [[MUL_0]], float* [[DST_GEP_0]], align 8 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load float, float* [[SRC_GEP_1]], align 8 @@ -127,9 +159,26 @@ ; CHECK-NEXT: [[MUL_4:%.*]] = fmul float [[ADD_4]], [[SRC_4]] ; CHECK-NEXT: [[DST_GEP_4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 ; CHECK-NEXT: store float [[MUL_4]], float* [[DST_GEP_4]], align 8 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; CHECK: loop.merge: ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[DST_GEP_05:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SRC_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: [[SRC_GEP_421:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 4 +; CHECK-NEXT: [[SRC_422:%.*]] = load float, float* [[SRC_GEP_421]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[ADD_423:%.*]] = fadd float [[SRC_422]], 1.000000e+00 +; CHECK-NEXT: [[MUL_424:%.*]] = fmul float [[ADD_423]], [[SRC_422]] +; CHECK-NEXT: [[DST_GEP_425:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 +; CHECK-NEXT: store float [[MUL_424]], float* [[DST_GEP_425]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: br label [[LOOP_MERGE]] ; entry: br label %loop diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s -; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s +; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck %s +; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning=false -passes=slp-vectorizer -mtriple=arm64-apple-darwin -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: slpversioned define void @needs_versioning_not_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_not_profitable( @@ -30,9 +32,15 @@ define void @needs_versioning_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[DST17:%.*]] = 
ptrtoint i32* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint i32* [[SRC:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SRC16]], [[DST17]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 ; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 @@ -48,7 +56,16 @@ ; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %src.0 = load i32, i32* %src, align 4 @@ -76,11 +93,21 @@ define void @needs_versioning_profitable_2_sources(i32* %dst, i32* %A, i32* %B) { ; CHECK-LABEL: @needs_versioning_profitable_2_sources( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[B29:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[DST28:%.*]] = ptrtoint i32* [[DST:%.*]] to i64 +; CHECK-NEXT: [[A27:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A27]], [[DST28]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[B29]], [[DST28]] +; CHECK-NEXT: [[DIFF_CHECK30:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK30]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = add i32 [[A_0]], [[B_0]] ; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[R_0]], 2 -; CHECK-NEXT: store i32 [[MUL_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL_0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 ; CHECK-NEXT: [[A_1:%.*]] = load i32, i32* [[A_GEP_1]], align 4 ; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 @@ -105,7 +132,19 @@ ; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[R_3]], 2 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[MUL_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, 
!alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !11, !noalias !12 +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !alias.scope !13, !noalias !14 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %A.0 = load i32, i32* %A, align 4 @@ -148,12 +187,18 @@ define void @needs_versioning_profitable_split_points(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable_split_points( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST17:%.*]] = ptrtoint i32* [[DST:%.*]] to i64 +; CHECK-NEXT: [[SRC16:%.*]] = ptrtoint i32* [[SRC:%.*]] to i64 ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: call void @bar() -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[SRC16]], [[DST17]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 ; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 ; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 @@ -169,8 +214,19 @@ ; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: br label [[ENTRY_TAIL:%.*]] +; CHECK: entry.tail: ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: call void @bar() @@ -347,29 +403,46 @@ define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[OUT_BLOCK13:%.*]] = ptrtoint i32* [[OUT_BLOCK:%.*]] to i64 +; CHECK-NEXT: [[COUNTER12:%.*]] = ptrtoint i32* [[COUNTER:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[COUNTER12]], [[OUT_BLOCK13]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP2]], 
[[TMP1]] ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP4]], [[TMP3]] ; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP6]], [[TMP5]] ; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 -; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !20, !noalias !23 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -608,6 +681,7 @@ ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT]], %struct* [[A]], i64 0, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = load i32*, i32** [[B:%.*]], align 8 @@ -710,31 +784,49 @@ ; CHECK-NEXT: [[PTR_PHI:%.*]] = phi %struct.2* [ [[A:%.*]], [[BB:%.*]] ], [ null, [[LOOP]] ] ; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], 
i64 0
+; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ]
+; CHECK-NEXT: [[PTR_PHI_LCSSA22:%.*]] = ptrtoint %struct.2* [[PTR_PHI_LCSSA]] to i64
+; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr float, float* [[B:%.*]], i64 0
+; CHECK-NEXT: [[B_GEP_021:%.*]] = ptrtoint float* [[B_GEP_0]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B_GEP_021]], [[PTR_PHI_LCSSA22]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]]
+; CHECK: exit.scalar:
 ; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8
 ; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 0
+; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0
 ; CHECK-NEXT: store float [[MUL_0]], float* [[A_GEP_0]], align 8
 ; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1
 ; CHECK-NEXT: [[L_1:%.*]] = load float, float* [[B_GEP_1]], align 8
 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 1
+; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 1
 ; CHECK-NEXT: store float [[MUL_1]], float* [[A_GEP_1]], align 8
 ; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
 ; CHECK-NEXT: [[L_2:%.*]] = load float, float* [[B_GEP_2]], align 8
 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_2]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 2
+; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 2
 ; CHECK-NEXT: store float [[MUL_2]], float* [[A_GEP_2]], align 8
 ; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT: [[L_3:%.*]] = load float, float* [[B_GEP_3]], align 8
 ; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_3]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01
-; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI]], i64 0, i32 0, i32 3
+; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3
 ; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8
+; CHECK-NEXT: br label [[EXIT_MERGE:%.*]]
+; CHECK: exit.merge:
 ; CHECK-NEXT: ret void
+; CHECK: exit.slpversioned1:
+; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8, !alias.scope !25, !noalias !28
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]],
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]],
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !28, !noalias !25
+; CHECK-NEXT: br label [[EXIT_MERGE]]
 ;
 bb:
   br label %loop
@@ -781,7 +873,13 @@
 ; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ]
-; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 0
+; CHECK-NEXT: [[PTR_PHI_LCSSA22:%.*]] = ptrtoint %struct.2* [[PTR_PHI_LCSSA]] to i64
+; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr float, float* [[B:%.*]], i64 0
+; CHECK-NEXT: [[B_GEP_021:%.*]] = ptrtoint float* [[B_GEP_0]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B_GEP_021]], [[PTR_PHI_LCSSA22]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]]
+; CHECK: exit.scalar:
 ; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8
 ; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01
 ; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01
@@ -805,7 +903,18 @@
 ; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01
 ; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3
 ; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8
+; CHECK-NEXT: br label [[EXIT_MERGE:%.*]]
+; CHECK: exit.merge:
 ; CHECK-NEXT: ret void
+; CHECK: exit.slpversioned1:
+; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8, !alias.scope !30, !noalias !33
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]],
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]],
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !33, !noalias !30
+; CHECK-NEXT: br label [[EXIT_MERGE]]
 ;
 bb:
   br label %loop
@@ -1185,10 +1294,13 @@
 define void @crash_instructions_deleted(float* %t, i32* %a, i32** noalias %ptr) {
 ; CHECK-LABEL: @crash_instructions_deleted(
 ; CHECK-NEXT: bb:
+; CHECK-NEXT: [[T42:%.*]] = ptrtoint float* [[T:%.*]] to i64
 ; CHECK-NEXT: [[T15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 2
+; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[T15]] to <2 x i32>*
 ; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8
 ; CHECK-NEXT: [[T17:%.*]] = load i32*, i32** [[PTR:%.*]], align 8
+; CHECK-NEXT: [[T1718:%.*]] = ptrtoint i32* [[T17]] to i64
 ; CHECK-NEXT: br label [[BB18:%.*]]
 ; CHECK: bb18:
 ; CHECK-NEXT: [[T19:%.*]] = sext i32 0 to i64
@@ -1198,10 +1310,15 @@
 ; CHECK-NEXT: [[T23:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 1
 ; CHECK-NEXT: [[T24:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 2
 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[T1718]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[T42]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[BB18_SCALAR:%.*]], label [[BB18_SLPVERSIONED1:%.*]]
+; CHECK: bb18.scalar:
 ; CHECK-NEXT: [[T26:%.*]] = load i8, i8* [[T22]], align 1
 ; CHECK-NEXT: [[T27:%.*]] = uitofp i8 [[T26]] to float
 ; CHECK-NEXT: [[T28:%.*]] = fdiv float [[T27]], 2.550000e+02
-; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds float, float* [[T:%.*]], i64 0
+; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds float, float* [[T]], i64 0
 ; CHECK-NEXT: store float [[T28]], float* [[T29]], align 8
 ; CHECK-NEXT: [[T30:%.*]] = load i8, i8* [[T23]], align 1
 ; CHECK-NEXT: [[T31:%.*]] = uitofp i8 [[T30]] to float
@@ -1218,7 +1335,18 @@
 ; CHECK-NEXT: [[T40:%.*]] = fdiv float [[T39]], 2.550000e+02
 ; CHECK-NEXT: [[T41:%.*]] = getelementptr inbounds float, float* [[T]], i64 3
 ; CHECK-NEXT: store float [[T40]], float* [[T41]], align 4
+; CHECK-NEXT: br label [[BB18_MERGE:%.*]]
+; CHECK: bb18.merge:
 ; CHECK-NEXT: ret void
+; CHECK: bb18.slpversioned1:
+; CHECK-NEXT: [[T295:%.*]] = getelementptr inbounds float, float* [[T]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[T22]] to <4 x i8>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1, !alias.scope !35, !noalias !38
+; CHECK-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = fdiv <4 x float> [[TMP5]],
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[T295]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP6]], <4 x float>* [[TMP7]], align 8, !alias.scope !38, !noalias !35
+; CHECK-NEXT: br label [[BB18_MERGE]]
 ;
 bb:
   %t6 = icmp slt i32 10, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
@@ -1,33 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s
-; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -passes=slp-vectorizer -mtriple=x86_64-apple-darwin -S %s | FileCheck %s
+; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning -passes=slp-vectorizer -mtriple=x86_64-apple-darwin -S %s | FileCheck %s
+; RUN: opt -aa-pipeline='basic-aa,scoped-noalias-aa' -slp-memory-versioning=false -passes=slp-vectorizer -mtriple=x86_64-apple-darwin -S %s | FileCheck --check-prefix=NOVERSION %s
+
+; NOVERSION-NOT: memcheck
 define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) {
 ; CHECK-LABEL: @version_multiple(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[OUT_BLOCK13:%.*]] = ptrtoint i32* [[OUT_BLOCK:%.*]] to i64
+; CHECK-NEXT: [[COUNTER12:%.*]] = ptrtoint i32* [[COUNTER:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[COUNTER12]], [[OUT_BLOCK13]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]]
+; CHECK: entry.scalar:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[COUNTER]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
-; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
+; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
-; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]]
+; CHECK: entry.merge:
 ; CHECK-NEXT: ret void
+; CHECK: entry.slpversioned1:
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>*
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: br label [[ENTRY_MERGE]]
 ;
 entry:
   %0 = load i32, i32* %counter, align 4