diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -55,6 +55,14 @@ } // end namespace slpvectorizer +struct SLPVectorizerResult { + bool MadeAnyChange; + bool MadeCFGChange; + + SLPVectorizerResult(bool MadeAnyChange, bool MadeCFGChange) + : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {} +}; + struct SLPVectorizerPass : public PassInfoMixin { using StoreList = SmallVector; using StoreListMap = MapVector; @@ -75,10 +83,12 @@ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); // Glue for old PM. - bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, - DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, - OptimizationRemarkEmitter *ORE_); + SLPVectorizerResult runImpl(Function &F, ScalarEvolution *SE_, + TargetTransformInfo *TTI_, + TargetLibraryInfo *TLI_, AAResults *AA_, + LoopInfo *LI_, DominatorTree *DT_, + AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_); private: /// Collect store and getelementptr instructions and organize them @@ -137,6 +147,11 @@ bool vectorizeStores(ArrayRef Stores, slpvectorizer::BoUpSLP &R); + SLPVectorizerResult + vectorizeBlockWithVersioning(BasicBlock *BB, + const SmallPtrSetImpl &TrackedObjects, + slpvectorizer::BoUpSLP &R); + /// The store instructions in a basic block organized by base pointer. StoreListMap Stores; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -35,6 +35,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" @@ -62,6 +63,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" @@ -85,8 +87,11 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Vectorize.h" #include #include @@ -107,6 +112,10 @@ #define DEBUG_TYPE "SLP" STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); +STATISTIC(NumVersioningSuccessful, + "Number of times versioning was tried and beneficial"); +STATISTIC(NumVersioningFailed, + "Number of times versioning was tried but was not beneficial"); cl::opt RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes")); @@ -175,6 +184,10 @@ ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); +static cl::opt EnableMemoryVersioning( + "slp-memory-versioning", cl::init(false), cl::Hidden, + cl::desc("Enable memory versioning for SLP vectorization.")); + // Limit the number of 
alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -581,6 +594,45 @@ return Index; } +struct AccessInfo { + Value *UnderlyingObj; + const SCEV *PtrSCEV; + Type *AccessTy; + + AccessInfo() : UnderlyingObj(nullptr), PtrSCEV(nullptr), AccessTy(nullptr) {} + AccessInfo(Value *UnderlyingObj, const SCEV *PtrSCEV, Type *AccessTy) + : UnderlyingObj(UnderlyingObj), PtrSCEV(PtrSCEV), AccessTy(AccessTy) {} +}; +static AccessInfo getObject(Instruction &I, ScalarEvolution &SE, + DominatorTree &DT) { + BasicBlock *BB = I.getParent(); + auto GetPtrAndAccessTy = [](Instruction *I) -> std::pair { + if (auto *L = dyn_cast(I)) { + if (!L->getType()->isVectorTy()) + return {L->getPointerOperand(), L->getType()}; + } + if (auto *S = dyn_cast(I)) + if (!S->getValueOperand()->getType()->isVectorTy()) + return {S->getPointerOperand(), S->getValueOperand()->getType()}; + return {nullptr, nullptr}; + }; + Value *Ptr; + Type *AccessTy; + std::tie(Ptr, AccessTy) = GetPtrAndAccessTy(&I); + if (!Ptr) + return {}; + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) + return {}; + auto *Start = SE.getSCEV(Ptr); + + PHINode *PN = dyn_cast(Obj); + if (!SE.properlyDominates(Start, BB) && + !(PN && DT.dominates(PN->getParent(), BB))) + return {}; + return {Obj, Start, AccessTy}; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -589,6 +641,16 @@ struct ScheduleData; public: + // Map of objects to start & end pointers we need to generate runtime checks + // for. + SmallPtrSet TrackedObjects; + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + using AliasCacheKey = std::pair; + DenseMap> AliasCache; + + bool CollectMemAccess = false; + using ValueList = SmallVector; using InstrList = SmallVector; using ValueSet = SmallPtrSet; @@ -772,6 +834,24 @@ "All indices must be initialized"); } + void removeDeletedInstructions() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they + // were marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } + DeletedInstructions.clear(); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -1976,11 +2056,6 @@ return aliased; } - using AliasCacheKey = std::pair; - - /// Cache for alias results. - /// TODO: consider moving this to the AliasAnalysis itself. - DenseMap> AliasCache; /// Removes an instruction from its block and eventually deletes it. /// It's like Instruction::eraseFromParent() except that the actual deletion @@ -2565,27 +2640,7 @@ } // end namespace llvm -BoUpSLP::~BoUpSLP() { - for (const auto &Pair : DeletedInstructions) { - // Replace operands of ignored instructions with Undefs in case if they were - // marked for deletion. 
- if (Pair.getSecond()) { - Value *Undef = UndefValue::get(Pair.getFirst()->getType()); - Pair.getFirst()->replaceAllUsesWith(Undef); - } - Pair.getFirst()->dropAllReferences(); - } - for (const auto &Pair : DeletedInstructions) { - assert(Pair.getFirst()->use_empty() && - "trying to erase instruction with users."); - Pair.getFirst()->eraseFromParent(); - } -#ifdef EXPENSIVE_CHECKS - // If we could guarantee that this call is not extremely slow, we could - // remove the ifdef limitation (see PR47712). - assert(!verifyFunction(*F, &dbgs())); -#endif -} +BoUpSLP::~BoUpSLP() { removeDeletedInstructions(); } void BoUpSLP::eraseInstructions(ArrayRef AV) { for (auto *V : AV) { @@ -6242,6 +6297,7 @@ while (DepDest) { assert(isInSchedulingRegion(DepDest)); + ScheduleData *DestBundle = DepDest->FirstInBundle; // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to // SLP->isAliased (which is the expensive part in this loop). @@ -6259,9 +6315,38 @@ // balance between reduced runtime and accurate dependencies. numAliased++; + // If this bundle is not scheduled and no versioned code has been + // generated yet, try to collect the bounds of the accesses to + // generate runtime checks. + if (!DestBundle->IsScheduled && SLP->CollectMemAccess) { + // FIXME Naming + auto GetPtr = [](Instruction *I) -> Value * { + if (auto *L = dyn_cast(I)) + return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) + return S->getPointerOperand(); + return nullptr; + }; + auto *Src = GetPtr(SrcInst); + auto *Dst = GetPtr(DepDest->Inst); + + if (SrcInst->getParent() == DepDest->Inst->getParent() && Src && + Dst) { + auto SrcObjAndPtr = getObject(*SrcInst, *SLP->SE, *SLP->DT); + auto DstObjAndPtr = + getObject(*DepDest->Inst, *SLP->SE, *SLP->DT); + if (!SrcObjAndPtr.UnderlyingObj || + !DstObjAndPtr.UnderlyingObj || + SrcObjAndPtr.UnderlyingObj == DstObjAndPtr.UnderlyingObj) + SLP->TrackedObjects.clear(); + else { + SLP->TrackedObjects.insert(SrcObjAndPtr.UnderlyingObj); + SLP->TrackedObjects.insert(DstObjAndPtr.UnderlyingObj); + } + } + } DepDest->MemoryDependencies.push_back(BundleMember); BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; if (!DestBundle->IsScheduled) { BundleMember->incrementUnscheduledDeps(1); } @@ -6701,7 +6786,7 @@ auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE).MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -6719,7 +6804,8 @@ AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); - AU.setPreservesCFG(); + if (!EnableMemoryVersioning) + AU.setPreservesCFG(); } }; @@ -6736,23 +6822,305 @@ auto *DB = &AM.getResult(F); auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); - if (!Changed) + auto Result = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); + if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet(); + if (!Result.MadeCFGChange) + PA.preserveSet(); + PA.preserve(); + PA.preserve(); return PA; } -bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, - TargetTransformInfo *TTI_, - TargetLibraryInfo *TLI_, AAResults *AA_, - LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_, - OptimizationRemarkEmitter *ORE_) { +SLPVectorizerResult SLPVectorizerPass::vectorizeBlockWithVersioning( + 
BasicBlock *BB, const SmallPtrSetImpl &TrackedObjects, + slpvectorizer::BoUpSLP &R) { + bool Changed = false; + bool CFGChanged = false; + R.AliasCache.clear(); + + // First, clean up delete instructions, so they are not re-used during SCEV + // expansion. + R.removeDeletedInstructions(); + + // Collect up-to-date memory bounds for tracked objects. Also collect the + // first and last memory instruction using a tracked object. + MapVector MemBounds; + SmallPtrSet WrittenObjs; + Instruction *FirstTrackedInst = nullptr; + Instruction *LastTrackedInst = nullptr; + for (Instruction &I : *BB) { + auto ObjAndStart = getObject(I, *SE, *DT); + if (!ObjAndStart.UnderlyingObj) + continue; + auto *Obj = ObjAndStart.UnderlyingObj; + const auto *Start = ObjAndStart.PtrSCEV; + + if (I.mayWriteToMemory()) + WrittenObjs.insert(Obj); + + unsigned AS = Obj->getType()->getPointerAddressSpace(); + // Runtime checks are generated to ensure this property holds. + auto &DL = BB->getModule()->getDataLayout(); + Type *IdxTy = DL.getIndexType(Obj->getType()); + const SCEV *EltSizeSCEV = + SE->getStoreSizeOfExpr(IdxTy, ObjAndStart.AccessTy); + auto *End = SE->getAddExpr(Start, EltSizeSCEV); + + if (TrackedObjects.find(Obj) != TrackedObjects.end()) + MemBounds.insert({Obj, {0, Start, End, AS}}); + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + BoundsIter->second.addPointer(0, Start, End, AS, *SE); + if (!FirstTrackedInst) + FirstTrackedInst = &I; + LastTrackedInst = &I; + } + + // Not enough memory access bounds for runtime checks. + if (MemBounds.size() < 2 || WrittenObjs.empty()) + return {Changed, CFGChanged}; + + // Check if all uses between the first and last tracked instruction are inside + // the region. If that is not the case, PHIs would need to be added when + // duplicating the block. + auto AllUsesInside = [FirstTrackedInst, LastTrackedInst](BasicBlock *BB) { + return all_of(make_range(FirstTrackedInst->getIterator(), + std::next(LastTrackedInst->getIterator())), + [LastTrackedInst, BB](Instruction &I) { + return all_of(I.users(), [LastTrackedInst, BB](User *U) { + if (auto *UserI = dyn_cast(U)) + return UserI->getParent() == BB && + (UserI->comesBefore(LastTrackedInst) || + UserI == LastTrackedInst); + return true; + }); + }); + }; + if (!AllUsesInside(BB)) + return {Changed, CFGChanged}; + + SmallVector> BoundGroups; + for (auto &B : MemBounds) + BoundGroups.emplace_back(B.first, &B.second); + + // Create a RuntimePointerCheck for all groups in BoundGroups. + SmallVector PointerChecks; + for (unsigned I = 0, E = BoundGroups.size(); I != E; ++I) { + bool AWrites = WrittenObjs.contains(BoundGroups[I].first); + for (unsigned J = I + 1; J != E; ++J) + if (AWrites || WrittenObjs.contains(BoundGroups[J].first)) + PointerChecks.emplace_back(&*BoundGroups[I].second, + &*BoundGroups[J].second); + } + + // Duplicate BB now and set up block and branches for memory checks. 
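+  // The code below creates the following block structure (names derived from
+  // the original block name):
+  //
+  //   CheckBlock (original block up to FirstTrackedInst; receives the
+  //               runtime checks and a conditional branch)
+  //      |      \
+  //   ScalarBB   VectorBB (clone of ScalarBB; vectorized and annotated with
+  //      |      /          !alias.scope/!noalias metadata)
+  //   MergeBlock
+  //      |
+  //   Tail (instructions after LastTrackedInst, if any)
+  //
+  // The conditional branch is created with a placeholder 'true' condition; it
+  // is only replaced with the real overlap check if versioning turns out to
+  // be profitable. Otherwise the split blocks are folded back together.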
+ std::string OriginalBBName = BB->getName().str(); + IRBuilder<> ChkBuilder(BB->getFirstNonPHI()); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + + BasicBlock *Tail = nullptr; + if (LastTrackedInst->getNextNode() != BB->getTerminator()) + Tail = SplitBlock(BB, LastTrackedInst->getNextNode(), &DTU, LI, nullptr, + OriginalBBName + ".tail"); + auto *CheckBlock = BB; + BB = SplitBlock(BB, FirstTrackedInst, &DTU, LI, nullptr, + OriginalBBName + ".slpmemcheck"); + for (Use &U : make_early_inc_range(BB->uses())) { + BasicBlock *UserBB = cast(U.getUser())->getParent(); + if (UserBB == CheckBlock) + continue; + + U.set(CheckBlock); + DTU.applyUpdates({{DT->Delete, UserBB, BB}}); + DTU.applyUpdates({{DT->Insert, UserBB, CheckBlock}}); + } + CFGChanged = true; + + auto *MergeBlock = BB; + BasicBlock *ScalarBB = + splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr, + OriginalBBName + ".slpversioned"); + + ValueToValueMapTy VMap; + BasicBlock *VectorBB = CloneBasicBlock(ScalarBB, VMap, "", BB->getParent()); + ScalarBB->setName(OriginalBBName + ".scalar"); + MergeBlock->setName(OriginalBBName + ".merge"); + SmallVector Tmp; + Tmp.push_back(VectorBB); + remapInstructionsInBlocks(Tmp, VMap); + auto *Term = CheckBlock->getTerminator(); + ChkBuilder.SetInsertPoint(CheckBlock->getTerminator()); + ChkBuilder.CreateCondBr(ChkBuilder.getTrue(), ScalarBB, VectorBB); + Term->eraseFromParent(); + DTU.applyUpdates({{DT->Insert, CheckBlock, VectorBB}}); + if (auto *L = LI->getLoopFor(CheckBlock)) + L->addBasicBlockToLoop(VectorBB, *LI); + Changed = true; + + // Add !noalias metadata to memory accesses in the versiond block. + LLVMContext &Ctx = BB->getContext(); + MDBuilder MDB(Ctx); + MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain"); + + DenseMap GroupToScope; + for (const auto &Group : MemBounds) + GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain); + + for (Instruction &I : *VectorBB) { + auto GetPtr = [](Instruction *I) -> Value * { + if (auto *L = dyn_cast(I)) + return L->getPointerOperand(); + if (auto *S = dyn_cast(I)) + return S->getPointerOperand(); + return nullptr; + }; + auto *Ptr = GetPtr(&I); + if (!Ptr) + continue; + + auto *PtrSCEV = SE->getSCEV(Ptr); + Value *Obj = getUnderlyingObject(Ptr); + if (!Obj) { + if (auto *GEP = dyn_cast(Ptr)) + Obj = GEP->getOperand(0); + else + continue; + } + + auto BoundsIter = MemBounds.find(Obj); + if (BoundsIter == MemBounds.end()) + continue; + auto *LowerBound = BoundsIter->second.Low; + auto *UpperBound = BoundsIter->second.High; + auto *Scope = GroupToScope.find(&BoundsIter->second)->second; + + auto *LowerSub = SE->getMinusSCEV(PtrSCEV, LowerBound); + auto *UpperSub = SE->getMinusSCEV(UpperBound, PtrSCEV); + if (!isa(LowerSub) && + !isa(UpperSub) && + SE->isKnownNonNegative(LowerSub) && SE->isKnownNonNegative(UpperSub)) { + I.setMetadata( + LLVMContext::MD_alias_scope, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(Ctx, Scope))); + + SmallVector NonAliasing; + for (auto &KV : GroupToScope) { + if (KV.first == &BoundsIter->second) + continue; + NonAliasing.push_back(KV.second); + } + I.setMetadata(LLVMContext::MD_noalias, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias), + MDNode::get(Ctx, NonAliasing))); + } + } + + DTU.flush(); + DT->updateDFSNumbers(); + collectSeedInstructions(VectorBB); + + // Vectorize trees that end at stores. 
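+  // Seed instructions are re-collected from the cloned block (VectorBB), so
+  // the store chains vectorized below belong to the versioned copy. The cost
+  // of the versioned block plus an estimate for the runtime checks is then
+  // compared against the cost of the scalar block; if versioning is not
+  // beneficial, the CFG changes are undone again.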
+ assert(!Stores.empty() && "should have stores when versioning"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); + bool AnyVectorized = vectorizeStoreChains(R); + Changed |= AnyVectorized; + + R.removeDeletedInstructions(); + InstructionCost ScalarCost = 0; + for (Instruction &I : *ScalarBB) + ScalarCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + InstructionCost SLPCost = 0; + for (Instruction &I : make_early_inc_range(reverse(*VectorBB))) { + if (!I.getType()->isVoidTy() && I.use_empty()) { + I.eraseFromParent(); + continue; + } + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + } + + // Estimate the size of the runtime checks, consisting of computing lower & + // upper bounds (2), the overlap checks (2) and the AND/OR to combine the + // checks. + SLPCost += 5 * PointerChecks.size() + MemBounds.size(); + if (SLPCost >= ScalarCost) { + // Vectorization not beneficial or possible. Restore original state by + // removing the introduced blocks. + R.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "VersioningNotBeneficial", + &*ScalarBB->begin()) + << "Tried to version block but was not beneficial" + << ore::NV("VectorCost", SLPCost) + << " >= " << ore::NV("ScalarCost", ScalarCost) + << ore::NV("AnyVectorized", AnyVectorized); + }); + + Changed = false; + CFGChanged = false; + CheckBlock->setName(OriginalBBName); + Instruction *OldTerm = CheckBlock->getTerminator(); + OldTerm->eraseFromParent(); + IRBuilder<> Builder(CheckBlock); + Builder.CreateBr(ScalarBB); + DTU.applyUpdates({{DT->Delete, CheckBlock, VectorBB}}); + LI->removeBlock(VectorBB); + DTU.deleteBB(VectorBB); + DTU.applyUpdates({{DT->Delete, VectorBB, MergeBlock}}); + MergeBlockIntoPredecessor(MergeBlock, &DTU, LI); + if (Tail) + MergeBlockIntoPredecessor(Tail, &DTU, LI); + MergeBlockIntoPredecessor(ScalarBB, &DTU, LI); + NumVersioningFailed++; + } else { + R.getORE()->emit( + OptimizationRemark(SV_NAME, "VersioningSuccessful", &*ScalarBB->begin()) + << "SLP vectorization with versioning is beneficial " + << ore::NV("VectorCost", SLPCost) << " < " + << ore::NV("ScalarCost", ScalarCost) + << ore::NV("AnyVectorized", AnyVectorized)); + + ChkBuilder.SetInsertPoint(CheckBlock->getTerminator()); + SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(), + "memcheck"); + Value *MemoryOverlap = addRuntimeChecks(CheckBlock->getTerminator(), + nullptr, PointerChecks, Exp) + .second; + assert(MemoryOverlap && + "runtime checks required, but no checks generated in IR?"); + + Value *NoOverflowCheck = MemoryOverlap; + // https://alive2.llvm.org/ce/z/dTuGLx + // // Emit checks ensuring that computing the upper bound does not + // overflow. 
+ // for (auto &B : MemBounds) { + // Type *PtrArithTy = Type::getInt8PtrTy(Ctx, B.second.AddressSpace); + // Value *Low = Exp.expandCodeFor(B.second.Low, PtrArithTy); + // Value *High = Exp.expandCodeFor(B.second.High, PtrArithTy); + // NoOverflowCheck = ChkBuilder.CreateOr( + // NoOverflowCheck, ChkBuilder.CreateICmpUGT(Low, High, "wrap"), + //"check"); + //} + cast(CheckBlock->getTerminator()) + ->setCondition(NoOverflowCheck); + NumVersioningSuccessful++; + } + DTU.flush(); + DT->updateDFSNumbers(); + + return {Changed, CFGChanged}; +} + +SLPVectorizerResult SLPVectorizerPass::runImpl( + Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, + TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, + AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) { if (!RunSLPVectorization) - return false; + return {false, false}; SE = SE_; TTI = TTI_; TLI = TLI_; @@ -6766,15 +7134,16 @@ Stores.clear(); GEPs.clear(); bool Changed = false; + bool CFGChanged = false; // If the target claims to have no vector registers don't attempt // vectorization. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) - return false; + return {false, false}; // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) - return false; + return {false, false}; LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); @@ -6788,19 +7157,29 @@ // Update DFS numbers now so that we can use them for ordering. DT->updateDFSNumbers(); + SmallVector BlocksToRetry; + SmallVector, 4> BoundsToUse; // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { collectSeedInstructions(BB); + bool VectorizedBlock = false; // Vectorize trees that end at stores. if (!Stores.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() << " underlying objects.\n"); - Changed |= vectorizeStoreChains(R); + R.TrackedObjects.clear(); + + if (EnableMemoryVersioning) + R.CollectMemAccess = BB->size() <= 300; + + VectorizedBlock = vectorizeStoreChains(R); + + R.CollectMemAccess = false; } // Vectorize trees that end at reductions. - Changed |= vectorizeChainsInBlock(BB, R); + VectorizedBlock |= vectorizeChainsInBlock(BB, R); // Vectorize the index computations of getelementptr instructions. 
This // is primarily intended to catch gather-like idioms ending at @@ -6808,15 +7187,30 @@ if (!GEPs.empty()) { LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() << " underlying objects.\n"); - Changed |= vectorizeGEPIndices(BB, R); + VectorizedBlock |= vectorizeGEPIndices(BB, R); + } + + if (!VectorizedBlock && !R.TrackedObjects.empty()) { + BlocksToRetry.push_back(BB); + BoundsToUse.push_back(R.TrackedObjects); } + R.TrackedObjects.clear(); + Changed |= VectorizedBlock; } - if (Changed) { + for (unsigned I = 0; I != BlocksToRetry.size(); I++) { + auto Status = + vectorizeBlockWithVersioning(BlocksToRetry[I], BoundsToUse[I], R); + Changed |= Status.MadeAnyChange; + CFGChanged |= Status.MadeCFGChange; + } + + if (Changed && BlocksToRetry.empty()) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); } - return Changed; + + return {Changed, CFGChanged}; } /// Order may have elements assigned special value (size) which is out of diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -enable-new-pm=false < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -102,7 +102,15 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC]], [[SCEVGEP37]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]] @@ -111,7 +119,7 @@ ; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32 ; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]] ; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8 -; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1 +; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST]], align 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 ; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32 @@ -148,7 +156,27 @@ ; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], 
align 1 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[SHUFFLE36:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP14]], [[SHUFFLE36]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult <4 x i32> [[TMP16]], +; CHECK-NEXT: [[TMP18:%.*]] = icmp sgt <4 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = sext <4 x i1> [[TMP18]] to <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP16]], <4 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i8> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[DST]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP21]], <4 x i8>* [[TMP22]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %scale = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks-in-loops.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -scoped-noalias-aa -slp-vectorizer -slp-memory-versioning -enable-new-pm=false -mtriple=arm64-apple-ios -S %s | FileCheck %s + +define void @loop1(i32* %A, i32* %B, i64 %N) { +; CHECK-LABEL: @loop1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[LOOP_TAIL:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_TAIL]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i64 [[INDVAR]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP28:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 4 +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP2930:%.*]] = bitcast i32* [[SCEVGEP29]] to i8* +; CHECK-NEXT: [[SCEVGEP31:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP3132:%.*]] = bitcast i32* [[SCEVGEP31]] to i8* +; CHECK-NEXT: [[SCEVGEP33:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP3334:%.*]] = bitcast i32* [[SCEVGEP33]] to i8* +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP28]], [[SCEVGEP3334]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP3132]], [[SCEVGEP2930]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 
[[MEMCHECK_CONFLICT]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: +; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B_GEP_0]], align 4 +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A_GEP_0]], align 4 +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[A_0]], 20 +; CHECK-NEXT: [[XOR_0:%.*]] = xor i32 [[ADD_0]], [[B_0]] +; CHECK-NEXT: store i32 [[XOR_0]], i32* [[A_GEP_0]], align 4 +; CHECK-NEXT: [[IV_1:%.*]] = or i64 [[IV]], 1 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV_1]] +; CHECK-NEXT: [[B_1:%.*]] = load i32, i32* [[B_GEP_1]], align 4 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV_1]] +; CHECK-NEXT: [[A_1:%.*]] = load i32, i32* [[A_GEP_1]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A_1]], 20 +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[ADD_1]], [[B_1]] +; CHECK-NEXT: store i32 [[XOR_1]], i32* [[A_GEP_1]], align 4 +; CHECK-NEXT: [[IV_2:%.*]] = or i64 [[IV]], 2 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV_2]] +; CHECK-NEXT: [[B_2:%.*]] = load i32, i32* [[B_GEP_2]], align 4 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV_2]] +; CHECK-NEXT: [[A_2:%.*]] = load i32, i32* [[A_GEP_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[A_2]], 20 +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[ADD_2]], [[B_2]] +; CHECK-NEXT: store i32 [[XOR_2]], i32* [[A_GEP_2]], align 4 +; CHECK-NEXT: [[IV_3:%.*]] = or i64 [[IV]], 3 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV_3]] +; CHECK-NEXT: [[B_3:%.*]] = load i32, i32* [[B_GEP_3]], align 4 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV_3]] +; CHECK-NEXT: [[A_3:%.*]] = load i32, i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[A_3]], 20 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[ADD_3]], [[B_3]] +; CHECK-NEXT: store i32 [[XOR_3]], i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: br label [[LOOP_MERGE:%.*]] +; CHECK: loop.merge: +; CHECK-NEXT: br label [[LOOP_TAIL]] +; CHECK: loop.tail: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 16 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N:%.*]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[A_GEP_03:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B_GEP_0]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_GEP_03]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %B.gep.0 = getelementptr inbounds i32, i32* %B, i64 %iv + %B.0 = load i32, i32* %B.gep.0, align 4 + %A.gep.0 = getelementptr inbounds i32, i32* %A, i64 %iv + %A.0 = load i32, i32* %A.gep.0, align 4 + %add.0 = add i32 
%A.0, 20 + %xor.0 = xor i32 %add.0, %B.0 + store i32 %xor.0, i32* %A.gep.0, align 4 + %iv.1 = or i64 %iv, 1 + %B.gep.1 = getelementptr inbounds i32, i32* %B, i64 %iv.1 + %B.1 = load i32, i32* %B.gep.1, align 4 + %A.gep.1 = getelementptr inbounds i32, i32* %A, i64 %iv.1 + %A.1 = load i32, i32* %A.gep.1, align 4 + %add.1 = add i32 %A.1, 20 + %xor.1 = xor i32 %add.1, %B.1 + store i32 %xor.1, i32* %A.gep.1, align 4 + %iv.2 = or i64 %iv, 2 + %B.gep.2 = getelementptr inbounds i32, i32* %B, i64 %iv.2 + %B.2 = load i32, i32* %B.gep.2, align 4 + %A.gep.2 = getelementptr inbounds i32, i32* %A, i64 %iv.2 + %A.2 = load i32, i32* %A.gep.2, align 4 + %add.2 = add i32 %A.2, 20 + %xor.2 = xor i32 %add.2, %B.2 + store i32 %xor.2, i32* %A.gep.2, align 4 + %iv.3 = or i64 %iv, 3 + %B.gep.3 = getelementptr inbounds i32, i32* %B, i64 %iv.3 + %B.3 = load i32, i32* %B.gep.3, align 4 + %A.gep.3 = getelementptr inbounds i32, i32* %A, i64 %iv.3 + %A.3 = load i32, i32* %A.gep.3, align 4 + %add.3 = add i32 %A.3, 20 + %xor.3 = xor i32 %add.3, %B.3 + store i32 %xor.3, i32* %A.gep.3, align 4 + %iv.next = add nuw nsw i64 %iv, 16 + %cond = icmp ult i64 %iv.next, %N + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +define void @loop_iv_update_at_start(float* %src, float* %dst) #0 { +; CHECK-LABEL: @loop_iv_update_at_start( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC26:%.*]] = bitcast float* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[DST28:%.*]] = bitcast float* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[SRC]], i64 5 +; CHECK-NEXT: [[SCEVGEP27:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr float, float* [[DST]], i64 5 +; CHECK-NEXT: [[SCEVGEP2930:%.*]] = bitcast float* [[SCEVGEP29]] to i8* +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_MERGE:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV]], 2000 +; CHECK-NEXT: [[SRC_GEP_0:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 0 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC26]], [[SCEVGEP2930]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST28]], [[SCEVGEP27]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[LOOP_SCALAR:%.*]], label [[LOOP_SLPVERSIONED1:%.*]] +; CHECK: loop.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load float, float* [[SRC_GEP_0]], align 8 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[SRC_0]], 1.000000e+00 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], [[SRC_0]] +; CHECK-NEXT: [[DST_GEP_0:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 +; CHECK-NEXT: store float [[MUL_0]], float* [[DST_GEP_0]], align 8 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load float, float* [[SRC_GEP_1]], align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[SRC_1]], 1.000000e+00 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], [[SRC_1]] +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 1 +; CHECK-NEXT: store float [[MUL_1]], float* [[DST_GEP_1]], align 8 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load float, float* [[SRC_GEP_2]], align 8 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[SRC_2]], 
1.000000e+00 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], [[SRC_2]] +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[MUL_2]], float* [[DST_GEP_2]], align 8 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load float, float* [[SRC_GEP_3]], align 8 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[SRC_3]], 1.000000e+00 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], [[SRC_3]] +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 +; CHECK-NEXT: store float [[MUL_3]], float* [[DST_GEP_3]], align 8 +; CHECK-NEXT: [[SRC_GEP_4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 4 +; CHECK-NEXT: [[SRC_4:%.*]] = load float, float* [[SRC_GEP_4]], align 8 +; CHECK-NEXT: [[ADD_4:%.*]] = fadd float [[SRC_4]], 1.000000e+00 +; CHECK-NEXT: [[MUL_4:%.*]] = fmul float [[ADD_4]], [[SRC_4]] +; CHECK-NEXT: [[DST_GEP_4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 +; CHECK-NEXT: store float [[MUL_4]], float* [[DST_GEP_4]], align 8 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; CHECK: loop.merge: +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: loop.slpversioned1: +; CHECK-NEXT: [[DST_GEP_05:%.*]] = getelementptr inbounds float, float* [[DST]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[DST_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: [[SRC_GEP_421:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 4 +; CHECK-NEXT: [[SRC_422:%.*]] = load float, float* [[SRC_GEP_421]], align 8, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[ADD_423:%.*]] = fadd float [[SRC_422]], 1.000000e+00 +; CHECK-NEXT: [[MUL_424:%.*]] = fmul float [[ADD_423]], [[SRC_422]] +; CHECK-NEXT: [[DST_GEP_425:%.*]] = getelementptr inbounds float, float* [[DST]], i64 4 +; CHECK-NEXT: store float [[MUL_424]], float* [[DST_GEP_425]], align 8, !alias.scope !8, !noalias !5 +; CHECK-NEXT: br label [[LOOP_MERGE]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + %cond = icmp ult i32 %iv, 2000 + + %src.gep.0 = getelementptr inbounds float, float* %src, i64 0 + %src.0 = load float, float* %src.gep.0, align 8 + %add.0 = fadd float %src.0, 1.0 + %mul.0 = fmul float %add.0, %src.0 + %dst.gep.0 = getelementptr inbounds float, float* %dst, i64 0 + store float %mul.0, float* %dst.gep.0, align 8 + + %src.gep.1 = getelementptr inbounds float, float* %src, i64 1 + %src.1 = load float, float* %src.gep.1, align 8 + %add.1 = fadd float %src.1, 1.0 + %mul.1 = fmul float %add.1, %src.1 + %dst.gep.1 = getelementptr inbounds float, float* %dst, i64 1 + store float %mul.1, float* %dst.gep.1, align 8 + %src.gep.2 = getelementptr inbounds float, float* %src, i64 2 + %src.2 = load float, float* %src.gep.2, align 8 + %add.2 = fadd float %src.2, 1.0 + %mul.2 = fmul float %add.2, %src.2 + %dst.gep.2 = getelementptr inbounds float, float* %dst, i64 2 + store float %mul.2, float* %dst.gep.2, align 8 + %src.gep.3 = getelementptr inbounds float, float* 
%src, i64 3 + %src.3 = load float, float* %src.gep.3, align 8 + %add.3 = fadd float %src.3, 1.0 + %mul.3 = fmul float %add.3, %src.3 + %dst.gep.3 = getelementptr inbounds float, float* %dst, i64 3 + store float %mul.3, float* %dst.gep.3, align 8 + %src.gep.4 = getelementptr inbounds float, float* %src, i64 4 + %src.4 = load float, float* %src.gep.4, align 8 + %add.4 = fadd float %src.4, 1.0 + %mul.4 = fmul float %add.4, %src.4 + %dst.gep.4 = getelementptr inbounds float, float* %dst, i64 4 + store float %mul.4, float* %dst.gep.4, align 8 + br i1 %cond, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -scoped-noalias-aa -slp-vectorizer -slp-memory-versioning -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: slpversioned define void @needs_versioning_not_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_not_profitable( @@ -29,6 +32,317 @@ define void @needs_versioning_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC16:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[DST18:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 4 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[DST]], i64 4 +; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC16]], [[SCEVGEP1920]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST18]], [[SCEVGEP17]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* 
[[SRC_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] +; +entry: + %src.0 = load i32, i32* %src, align 4 + %r.0 = ashr i32 %src.0, 16 + store i32 %r.0, i32* %dst, align 4 + %src.gep.1 = getelementptr inbounds i32, i32* %src, i64 1 + %src.1 = load i32, i32* %src.gep.1, align 4 + %r.1 = ashr i32 %src.1, 16 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %r.1, i32* %dst.gep.1, align 4 + %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 + %src.2 = load i32, i32* %src.gep.2, align 4 + %r.2 = ashr i32 %src.2, 16 + %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 + store i32 %r.2, i32* %dst.gep.2, align 4 + %src.gep.3 = getelementptr inbounds i32, i32* %src, i64 3 + %src.3 = load i32, i32* %src.gep.3, align 4 + %r.3 = ashr i32 %src.3, 16 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %r.3, i32* %dst.gep.3, align 4 + + ret void +} + +define void @needs_versioning_profitable_2_sources(i32* %dst, i32* %A, i32* %B) { +; CHECK-LABEL: @needs_versioning_profitable_2_sources( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A27:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[DST29:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[B32:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 4 +; CHECK-NEXT: [[SCEVGEP28:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i32, i32* [[DST]], i64 4 +; CHECK-NEXT: [[SCEVGEP3031:%.*]] = bitcast i32* [[SCEVGEP30]] to i8* +; CHECK-NEXT: [[SCEVGEP33:%.*]] = getelementptr i32, i32* [[B]], i64 4 +; CHECK-NEXT: [[SCEVGEP3334:%.*]] = bitcast i32* [[SCEVGEP33]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A27]], [[SCEVGEP3031]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST29]], [[SCEVGEP28]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND035:%.*]] = icmp ult i8* [[B32]], [[SCEVGEP3031]] +; CHECK-NEXT: [[BOUND136:%.*]] = icmp ult i8* [[DST29]], [[SCEVGEP3334]] +; CHECK-NEXT: [[FOUND_CONFLICT37:%.*]] = and i1 [[BOUND035]], [[BOUND136]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT37]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[B_0:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = add i32 [[A_0]], [[B_0]] +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[R_0]], 2 +; CHECK-NEXT: store i32 [[MUL_0]], i32* [[DST]], align 4 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: [[A_1:%.*]] = load i32, i32* [[A_GEP_1]], align 4 +; CHECK-NEXT: 
[[B_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 +; CHECK-NEXT: [[B_1:%.*]] = load i32, i32* [[B_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = add i32 [[A_1]], [[B_1]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[R_1]], 2 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[MUL_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[A_2:%.*]] = load i32, i32* [[A_GEP_2]], align 4 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[B_2:%.*]] = load i32, i32* [[B_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = add i32 [[A_2]], [[B_2]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[R_2]], 2 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[MUL_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[A_3:%.*]] = load i32, i32* [[A_GEP_3]], align 4 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[B_3:%.*]] = load i32, i32* [[B_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = add i32 [[A_3]], [[B_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[R_3]], 2 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[MUL_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !5, !noalias !8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !11, !noalias !12 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !13, !noalias !14 +; CHECK-NEXT: br label [[ENTRY_MERGE]] +; +entry: + %A.0 = load i32, i32* %A, align 4 + %B.0 = load i32, i32* %B, align 4 + %r.0 = add i32 %A.0, %B.0 + %mul.0 = mul i32 %r.0, 2 + store i32 %mul.0, i32* %dst, align 4 + %A.gep.1 = getelementptr inbounds i32, i32* %A, i64 1 + %A.1 = load i32, i32* %A.gep.1, align 4 + %B.gep.1 = getelementptr inbounds i32, i32* %B, i64 1 + %B.1 = load i32, i32* %B.gep.1, align 4 + %r.1 = add i32 %A.1, %B.1 + %mul.1 = mul i32 %r.1, 2 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %mul.1, i32* %dst.gep.1, align 4 + %A.gep.2 = getelementptr inbounds i32, i32* %A, i64 2 + %A.2 = load i32, i32* %A.gep.2, align 4 + %B.gep.2 = getelementptr inbounds i32, i32* %B, i64 2 + %B.2 = load i32, i32* %B.gep.2, align 4 + %r.2 = add i32 %A.2, %B.2 + %mul.2 = mul i32 %r.2, 2 + %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 + store i32 %mul.2, i32* %dst.gep.2, align 4 + %A.gep.3 = getelementptr inbounds i32, i32* %A, i64 3 + %A.3 = load i32, i32* %A.gep.3, align 4 + %B.gep.3 = getelementptr inbounds i32, i32* %B, i64 3 + %B.3 = load i32, i32* %B.gep.3, align 4 + %r.3 = add i32 %A.3, %B.3 + %mul.3 = mul i32 %r.3, 2 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %mul.3, i32* %dst.gep.3, align 4 + + ret void +} + +declare void @use(i32) + +declare void @bar() + 
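+; In the test below, the calls to @bar() are not part of the versioned region:
+; the runtime checks are emitted after the leading calls and the trailing call
+; is split into the ".tail" block, so only the accesses between the first and
+; last tracked memory instruction are duplicated.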
+define void @needs_versioning_profitable_split_points(i32* %dst, i32* %src) { +; CHECK-LABEL: @needs_versioning_profitable_split_points( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SRC16:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[DST18:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 4 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[DST]], i64 4 +; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC16]], [[SCEVGEP1920]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST18]], [[SCEVGEP17]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* [[SRC_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: br label [[ENTRY_TAIL:%.*]] +; CHECK: entry.tail: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: br label [[ENTRY_MERGE]] +; +entry: + call void @bar() + call void @bar() + call void @bar() + + %src.0 = load i32, i32* %src, align 4 + %r.0 = ashr i32 %src.0, 16 + store i32 %r.0, i32* %dst, align 4 + %src.gep.1 = getelementptr inbounds i32, i32* %src, i64 1 + %src.1 = load i32, i32* %src.gep.1, align 4 + %r.1 = ashr i32 %src.1, 16 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %r.1, i32* %dst.gep.1, align 4 + %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 + %src.2 = load i32, i32* %src.gep.2, align 4 + %r.2 = ashr i32 %src.2, 16 + %dst.gep.2 = getelementptr inbounds i32, i32* 
%dst, i64 2 + store i32 %r.2, i32* %dst.gep.2, align 4 + %src.gep.3 = getelementptr inbounds i32, i32* %src, i64 3 + %src.3 = load i32, i32* %src.gep.3, align 4 + %r.3 = ashr i32 %src.3, 16 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %r.3, i32* %dst.gep.3, align 4 + + call void @bar() + ret void +} + +define void @needs_versioning_profitable_load_used_outside_region1(i32* %dst, i32* %src, i1 %c) { +; CHECK-LABEL: @needs_versioning_profitable_load_used_outside_region1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 +; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 +; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 +; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 +; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 +; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* [[SRC_GEP_3]], align 4 +; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 +; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: [[SRC_GEP_5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 5 +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[SRC_GEP_5]], align 4 +; CHECK-NEXT: call void @use(i32 [[L]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %then, label %exit + +then: + %src.0 = load i32, i32* %src, align 4 + %r.0 = ashr i32 %src.0, 16 + store i32 %r.0, i32* %dst, align 4 + %src.gep.1 = getelementptr inbounds i32, i32* %src, i64 1 + %src.1 = load i32, i32* %src.gep.1, align 4 + %r.1 = ashr i32 %src.1, 16 + %dst.gep.1 = getelementptr inbounds i32, i32* %dst, i64 1 + store i32 %r.1, i32* %dst.gep.1, align 4 + %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 + %src.2 = load i32, i32* %src.gep.2, align 4 + %r.2 = ashr i32 %src.2, 16 + %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 + store i32 %r.2, i32* %dst.gep.2, align 4 + %src.gep.3 = getelementptr inbounds i32, i32* %src, i64 3 + %src.3 = load i32, i32* %src.gep.3, align 4 + %r.3 = ashr i32 %src.3, 16 + %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 + store i32 %r.3, i32* %dst.gep.3, align 4 + %src.gep.5 = getelementptr inbounds i32, i32* %src, i64 5 + %l = load i32, i32* %src.gep.5 + call void @use(i32 %l) + br label %exit + +exit: + ret void +} + +define void @needs_versioning_profitable_load_used_outside_region2(i32* %dst, i32* %src, i1 %c) { +; CHECK-LABEL: @needs_versioning_profitable_load_used_outside_region2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: ; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 ; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 ; CHECK-NEXT: store 
i32 [[R_0]], i32* [[DST:%.*]], align 4 @@ -39,6 +353,8 @@ ; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 ; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 ; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 +; CHECK-NEXT: [[SRC_GEP_5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 5 +; CHECK-NEXT: [[L:%.*]] = load i32, i32* [[SRC_GEP_5]], align 4 ; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 ; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 @@ -47,9 +363,15 @@ ; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 ; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 ; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: call void @use(i32 [[L]]) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: ; CHECK-NEXT: ret void ; entry: + br i1 %c, label %then, label %exit + +then: %src.0 = load i32, i32* %src, align 4 %r.0 = ashr i32 %src.0, 16 store i32 %r.0, i32* %dst, align 4 @@ -60,6 +382,8 @@ store i32 %r.1, i32* %dst.gep.1, align 4 %src.gep.2 = getelementptr inbounds i32, i32* %src, i64 2 %src.2 = load i32, i32* %src.gep.2, align 4 + %src.gep.5 = getelementptr inbounds i32, i32* %src, i64 5 + %l = load i32, i32* %src.gep.5 %r.2 = ashr i32 %src.2, 16 %dst.gep.2 = getelementptr inbounds i32, i32* %dst, i64 2 store i32 %r.2, i32* %dst.gep.2, align 4 @@ -68,10 +392,14 @@ %r.3 = ashr i32 %src.3, 16 %dst.gep.3 = getelementptr inbounds i32, i32* %dst, i64 3 store i32 %r.3, i32* %dst.gep.3, align 4 + call void @use(i32 %l) + br label %exit +exit: ret void } define void @no_version(i32* nocapture %dst, i32* nocapture readonly %src) { ; CHECK-LABEL: @no_version( ; CHECK-NEXT: entry: @@ -100,8 +428,20 @@ define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 +; CHECK-NEXT: [[COUNTER12:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[OUT_BLOCK14:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 4 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 4 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[COUNTER12]], [[SCEVGEP1516]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OUT_BLOCK14]], [[SCEVGEP13]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 @@ -122,7 +462,18 @@ ; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 ; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: store
i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !20, !noalias !23 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !23, !noalias !20 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -446,9 +797,767 @@ br label %bb15 bb15: %tmp16 = fmul double %tmp, 20.0 store double %tmp16, double* %tmp9, align 8 %tmp17 = fmul double %tmp13, 30.0 store double %tmp17, double* %tmp14, align 8 + ret void +} + +%struct.2 = type { [4 x float] } + +; Make sure we do not crash when we encounter a SCEVCouldNotCompute. +define void @no_lcssa_phi(%struct.2* %A, float* %B, i1 %c) { +; CHECK-LABEL: @no_lcssa_phi( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi %struct.2* [ [[A:%.*]], [[BB:%.*]] ], [ null, [[LOOP]] ] +; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_PHI_LCSSA23:%.*]] = bitcast %struct.2* [[PTR_PHI_LCSSA]] to i8* +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 0 +; CHECK-NEXT: [[B_GEP_021:%.*]] = bitcast float* [[B_GEP_0]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B_GEP_0]], i64 4 +; CHECK-NEXT: [[SCEVGEP22:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI_LCSSA]], i64 1 +; CHECK-NEXT: [[SCEVGEP2425:%.*]] = bitcast %struct.2* [[SCEVGEP24]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B_GEP_021]], [[SCEVGEP2425]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR_PHI_LCSSA23]], [[SCEVGEP22]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]] +; CHECK: exit.scalar: +; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: store float [[MUL_0]], float* [[A_GEP_0]], align 8 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1 +; CHECK-NEXT: [[L_1:%.*]] = load float, float* [[B_GEP_1]], align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 1 +;
CHECK-NEXT: store float [[MUL_1]], float* [[A_GEP_1]], align 8 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; CHECK-NEXT: [[L_2:%.*]] = load float, float* [[B_GEP_2]], align 8 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_2]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 2 +; CHECK-NEXT: store float [[MUL_2]], float* [[A_GEP_2]], align 8 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[L_3:%.*]] = load float, float* [[B_GEP_3]], align 8 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_3]], 1.000000e+01 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3 +; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8 +; CHECK-NEXT: br label [[EXIT_MERGE:%.*]] +; CHECK: exit.merge: +; CHECK-NEXT: ret void +; CHECK: exit.slpversioned1: +; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 8, !alias.scope !25, !noalias !28 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 8, !alias.scope !28, !noalias !25 +; CHECK-NEXT: br label [[EXIT_MERGE]] +; +bb: + br label %loop + +loop: + %ptr.phi = phi %struct.2* [ %A, %bb ], [ null, %loop ] + br i1 %c, label %exit, label %loop + +exit: + %B.gep.0 = getelementptr inbounds float, float* %B, i64 0 + %l.0 = load float, float* %B.gep.0, align 8 + %add.0 = fadd float %l.0, 10.0 + %mul.0 = fmul float %add.0, 30.0 + %A.gep.0 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 0 + store float %mul.0, float* %A.gep.0, align 8 + %B.gep.1 = getelementptr inbounds float, float* %B, i64 1 + %l.1 = load float, float* %B.gep.1, align 8 + %add.1 = fadd float %l.1, 10.0 + %mul.1 = fmul float %add.1, 30.0 + %A.gep.1 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 1 + store float %mul.1, float* %A.gep.1, align 8 + %B.gep.2 = getelementptr inbounds float, float* %B, i64 2 + %l.2 = load float, float* %B.gep.2, align 8 + %add.2 = fadd float %l.2, 10.0 + %mul.2 = fmul float %add.2, 30.0 + %A.gep.2 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 2 + store float %mul.2, float* %A.gep.2, align 8 + %B.gep.3 = getelementptr inbounds float, float* %B, i64 3 + %l.3 = load float, float* %B.gep.3, align 8 + %add.3 = fadd float %l.3, 10.0 + %mul.3 = fmul float %add.3, 30.0 + %A.gep.3 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi, i64 0, i32 0, i32 3 + store float %mul.3, float* %A.gep.3, align 8 + ret void +} + +; Make sure lcssa phis as pointer bases are handled properly. 
+define void @lcssa_phi(%struct.2* %A, float* %B, i1 %c) { +; CHECK-LABEL: @lcssa_phi( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi %struct.2* [ [[A:%.*]], [[BB:%.*]] ], [ null, [[LOOP]] ] +; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi %struct.2* [ [[PTR_PHI]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_PHI_LCSSA23:%.*]] = bitcast %struct.2* [[PTR_PHI_LCSSA]] to i8* +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 0 +; CHECK-NEXT: [[B_GEP_021:%.*]] = bitcast float* [[B_GEP_0]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B_GEP_0]], i64 4 +; CHECK-NEXT: [[SCEVGEP22:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr [[STRUCT_2:%.*]], %struct.2* [[PTR_PHI_LCSSA]], i64 1 +; CHECK-NEXT: [[SCEVGEP2425:%.*]] = bitcast %struct.2* [[SCEVGEP24]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B_GEP_021]], [[SCEVGEP2425]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR_PHI_LCSSA23]], [[SCEVGEP22]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[EXIT_SCALAR:%.*]], label [[EXIT_SLPVERSIONED1:%.*]] +; CHECK: exit.scalar: +; CHECK-NEXT: [[L_0:%.*]] = load float, float* [[B_GEP_0]], align 8 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_0]], 1.000000e+01 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[ADD_0]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: store float [[MUL_0]], float* [[A_GEP_0]], align 8 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1 +; CHECK-NEXT: [[L_1:%.*]] = load float, float* [[B_GEP_1]], align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 1 +; CHECK-NEXT: store float [[MUL_1]], float* [[A_GEP_1]], align 8 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; CHECK-NEXT: [[L_2:%.*]] = load float, float* [[B_GEP_2]], align 8 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_2]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 2 +; CHECK-NEXT: store float [[MUL_2]], float* [[A_GEP_2]], align 8 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[L_3:%.*]] = load float, float* [[B_GEP_3]], align 8 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_3]], 1.000000e+01 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 3.000000e+01 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 3 +; CHECK-NEXT: store float [[MUL_3]], float* [[A_GEP_3]], align 8 +; CHECK-NEXT: br label [[EXIT_MERGE:%.*]] +; CHECK: exit.merge: +; CHECK-NEXT: ret void +; CHECK: exit.slpversioned1: +; CHECK-NEXT: [[A_GEP_05:%.*]] = getelementptr inbounds [[STRUCT_2]], %struct.2* [[PTR_PHI_LCSSA]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B_GEP_0]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 
x float>, <4 x float>* [[TMP0]], align 8, !alias.scope !30, !noalias !33 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[A_GEP_05]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 8, !alias.scope !33, !noalias !30 +; CHECK-NEXT: br label [[EXIT_MERGE]] +; +bb: + br label %loop + +loop: + %ptr.phi = phi %struct.2* [ %A, %bb ], [ null, %loop ] + br i1 %c, label %exit, label %loop + +exit: + %ptr.phi.lcssa = phi %struct.2* [ %ptr.phi, %loop ] + %B.gep.0 = getelementptr inbounds float, float* %B, i64 0 + %l.0 = load float, float* %B.gep.0, align 8 + %add.0 = fadd float %l.0, 10.0 + %mul.0 = fmul float %add.0, 30.0 + %A.gep.0 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 0 + store float %mul.0, float* %A.gep.0, align 8 + %B.gep.1 = getelementptr inbounds float, float* %B, i64 1 + %l.1 = load float, float* %B.gep.1, align 8 + %add.1 = fadd float %l.1, 10.0 + %mul.1 = fmul float %add.1, 30.0 + %A.gep.1 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 1 + store float %mul.1, float* %A.gep.1, align 8 + %B.gep.2 = getelementptr inbounds float, float* %B, i64 2 + %l.2 = load float, float* %B.gep.2, align 8 + %add.2 = fadd float %l.2, 10.0 + %mul.2 = fmul float %add.2, 30.0 + %A.gep.2 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 2 + store float %mul.2, float* %A.gep.2, align 8 + %B.gep.3 = getelementptr inbounds float, float* %B, i64 3 + %l.3 = load float, float* %B.gep.3, align 8 + %add.3 = fadd float %l.3, 10.0 + %mul.3 = fmul float %add.3, 30.0 + %A.gep.3 = getelementptr inbounds %struct.2, %struct.2* %ptr.phi.lcssa, i64 0, i32 0, i32 3 + store float %mul.3, float* %A.gep.3, align 8 + ret void +} + +%struct.spam = type { [60 x i32], i32, [12 x i8] } + +declare void @foo(i8*) + +; Test case with a basic block where parts can be vectorized without versioning. 
+define i32 @block_partly_vectorized_without_versioning(%struct.spam* readonly %arg, i8* nocapture readonly %arg1, i8* nocapture %arg2, i8* nocapture readonly %arg3, i8* %A, i8* %B) { +; CHECK-LABEL: @block_partly_vectorized_without_versioning( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[T:%.*]] = alloca <16 x i8>, align 16 +; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[T]], i64 0, i64 0 +; CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds i8, i8* [[ARG3:%.*]], i64 1 +; CHECK-NEXT: [[T6:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 2 +; CHECK-NEXT: [[T7:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 3 +; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 4 +; CHECK-NEXT: [[T9:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 5 +; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 6 +; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 7 +; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 8 +; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 9 +; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 10 +; CHECK-NEXT: [[T15:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 11 +; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 12 +; CHECK-NEXT: [[T17:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 13 +; CHECK-NEXT: [[T18:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 14 +; CHECK-NEXT: [[T19:%.*]] = bitcast i8* [[ARG1:%.*]] to <16 x i8>* +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 0 +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 0 +; CHECK-NEXT: [[A_GEP_1:%.*]] = getelementptr i8, i8* [[A]], i64 1 +; CHECK-NEXT: [[B_GEP_1:%.*]] = getelementptr i8, i8* [[B]], i64 1 +; CHECK-NEXT: [[A_GEP_2:%.*]] = getelementptr i8, i8* [[A]], i64 2 +; CHECK-NEXT: [[B_GEP_2:%.*]] = getelementptr i8, i8* [[B]], i64 2 +; CHECK-NEXT: [[A_GEP_3:%.*]] = getelementptr i8, i8* [[A]], i64 3 +; CHECK-NEXT: [[B_GEP_3:%.*]] = getelementptr i8, i8* [[B]], i64 3 +; CHECK-NEXT: [[A_GEP_4:%.*]] = getelementptr i8, i8* [[A]], i64 4 +; CHECK-NEXT: [[B_GEP_4:%.*]] = getelementptr i8, i8* [[B]], i64 4 +; CHECK-NEXT: [[A_GEP_5:%.*]] = getelementptr i8, i8* [[A]], i64 5 +; CHECK-NEXT: [[B_GEP_5:%.*]] = getelementptr i8, i8* [[B]], i64 5 +; CHECK-NEXT: [[A_GEP_6:%.*]] = getelementptr i8, i8* [[A]], i64 6 +; CHECK-NEXT: [[B_GEP_6:%.*]] = getelementptr i8, i8* [[B]], i64 6 +; CHECK-NEXT: [[A_GEP_7:%.*]] = getelementptr i8, i8* [[A]], i64 7 +; CHECK-NEXT: [[B_GEP_7:%.*]] = getelementptr i8, i8* [[B]], i64 7 +; CHECK-NEXT: [[A_GEP_8:%.*]] = getelementptr i8, i8* [[A]], i64 8 +; CHECK-NEXT: [[B_GEP_8:%.*]] = getelementptr i8, i8* [[B]], i64 8 +; CHECK-NEXT: [[A_GEP_9:%.*]] = getelementptr i8, i8* [[A]], i64 9 +; CHECK-NEXT: [[B_GEP_9:%.*]] = getelementptr i8, i8* [[B]], i64 9 +; CHECK-NEXT: [[A_GEP_10:%.*]] = getelementptr i8, i8* [[A]], i64 10 +; CHECK-NEXT: [[B_GEP_10:%.*]] = getelementptr i8, i8* [[B]], i64 10 +; CHECK-NEXT: [[A_GEP_11:%.*]] = getelementptr i8, i8* [[A]], i64 11 +; CHECK-NEXT: [[B_GEP_11:%.*]] = getelementptr i8, i8* [[B]], i64 11 +; CHECK-NEXT: [[A_GEP_12:%.*]] = getelementptr i8, i8* [[A]], i64 12 +; CHECK-NEXT: [[B_GEP_12:%.*]] = getelementptr i8, i8* [[B]], i64 12 +; CHECK-NEXT: [[A_GEP_13:%.*]] = getelementptr i8, i8* [[A]], i64 13 +; CHECK-NEXT: [[B_GEP_13:%.*]] = getelementptr i8, i8* [[B]], i64 13 +; CHECK-NEXT: [[A_GEP_14:%.*]] = getelementptr i8, i8* [[A]], i64 14 +; 
CHECK-NEXT: [[B_GEP_14:%.*]] = getelementptr i8, i8* [[B]], i64 14 +; CHECK-NEXT: [[A_GEP_15:%.*]] = getelementptr i8, i8* [[A]], i64 15 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[B_GEP_15:%.*]] = getelementptr i8, i8* [[B]], i64 15 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[R_GEP_0:%.*]] = getelementptr i8, i8* [[ARG1]], i64 0 +; CHECK-NEXT: [[R_GEP_1:%.*]] = getelementptr i8, i8* [[ARG1]], i64 1 +; CHECK-NEXT: [[R_GEP_2:%.*]] = getelementptr i8, i8* [[ARG1]], i64 2 +; CHECK-NEXT: [[R_GEP_3:%.*]] = getelementptr i8, i8* [[ARG1]], i64 3 +; CHECK-NEXT: [[R_GEP_4:%.*]] = getelementptr i8, i8* [[ARG1]], i64 4 +; CHECK-NEXT: [[R_GEP_5:%.*]] = getelementptr i8, i8* [[ARG1]], i64 5 +; CHECK-NEXT: [[R_GEP_6:%.*]] = getelementptr i8, i8* [[ARG1]], i64 6 +; CHECK-NEXT: [[R_GEP_7:%.*]] = getelementptr i8, i8* [[ARG1]], i64 7 +; CHECK-NEXT: [[R_GEP_8:%.*]] = getelementptr i8, i8* [[ARG1]], i64 8 +; CHECK-NEXT: [[R_GEP_9:%.*]] = getelementptr i8, i8* [[ARG1]], i64 9 +; CHECK-NEXT: [[R_GEP_10:%.*]] = getelementptr i8, i8* [[ARG1]], i64 10 +; CHECK-NEXT: [[R_GEP_11:%.*]] = getelementptr i8, i8* [[ARG1]], i64 11 +; CHECK-NEXT: [[R_GEP_12:%.*]] = getelementptr i8, i8* [[ARG1]], i64 12 +; CHECK-NEXT: [[R_GEP_13:%.*]] = getelementptr i8, i8* [[ARG1]], i64 13 +; CHECK-NEXT: [[R_GEP_14:%.*]] = getelementptr i8, i8* [[ARG1]], i64 14 +; CHECK-NEXT: [[R_GEP_15:%.*]] = getelementptr i8, i8* [[ARG1]], i64 15 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[R_GEP_0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 15 +; CHECK-NEXT: [[T22:%.*]] = bitcast i8* [[ARG3]] to <16 x i8>* +; CHECK-NEXT: call void @foo(i8* nonnull [[T4]]) +; CHECK-NEXT: [[T26:%.*]] = load i8, i8* [[ARG3]], align 1 +; CHECK-NEXT: [[T27:%.*]] = load i8, i8* [[ARG2:%.*]], align 1 +; CHECK-NEXT: [[T28:%.*]] = xor i8 [[T27]], [[T26]] +; CHECK-NEXT: store i8 [[T28]], i8* [[ARG2]], align 1 +; CHECK-NEXT: [[T29:%.*]] = load i8, i8* [[T5]], align 1 +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 1 +; CHECK-NEXT: [[T31:%.*]] = load i8, i8* [[T30]], align 1 +; CHECK-NEXT: [[T32:%.*]] = xor i8 [[T31]], [[T29]] +; CHECK-NEXT: store i8 [[T32]], i8* [[T30]], align 1 +; CHECK-NEXT: [[T33:%.*]] = load i8, i8* [[T6]], align 1 +; CHECK-NEXT: [[T34:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 2 +; CHECK-NEXT: [[T35:%.*]] = load i8, i8* [[T34]], align 1 +; CHECK-NEXT: [[T36:%.*]] = xor i8 [[T35]], [[T33]] +; CHECK-NEXT: store i8 [[T36]], i8* [[T34]], align 1 +; CHECK-NEXT: [[T37:%.*]] = load i8, i8* [[T7]], align 1 +; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 3 +; CHECK-NEXT: [[T39:%.*]] = load i8, i8* [[T38]], align 1 +; CHECK-NEXT: [[T40:%.*]] = xor i8 [[T39]], [[T37]] +; CHECK-NEXT: store i8 [[T40]], i8* [[T38]], align 1 +; CHECK-NEXT: [[T41:%.*]] = load i8, i8* [[T8]], align 1 +; CHECK-NEXT: [[T42:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 4 +; CHECK-NEXT: [[T43:%.*]] = load i8, i8* [[T42]], align 1 +; CHECK-NEXT: [[T44:%.*]] = xor i8 [[T43]], [[T41]] +; CHECK-NEXT: store i8 [[T44]], i8* [[T42]], align 1 +; CHECK-NEXT: [[T45:%.*]] = load i8, i8* [[T9]], align 1 +; CHECK-NEXT: [[T46:%.*]] = 
getelementptr inbounds i8, i8* [[ARG2]], i64 5 +; CHECK-NEXT: [[T47:%.*]] = load i8, i8* [[T46]], align 1 +; CHECK-NEXT: [[T48:%.*]] = xor i8 [[T47]], [[T45]] +; CHECK-NEXT: store i8 [[T48]], i8* [[T46]], align 1 +; CHECK-NEXT: [[T49:%.*]] = load i8, i8* [[T10]], align 1 +; CHECK-NEXT: [[T50:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 6 +; CHECK-NEXT: [[T51:%.*]] = load i8, i8* [[T50]], align 1 +; CHECK-NEXT: [[T52:%.*]] = xor i8 [[T51]], [[T49]] +; CHECK-NEXT: store i8 [[T52]], i8* [[T50]], align 1 +; CHECK-NEXT: [[T53:%.*]] = load i8, i8* [[T11]], align 1 +; CHECK-NEXT: [[T54:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 7 +; CHECK-NEXT: [[T55:%.*]] = load i8, i8* [[T54]], align 1 +; CHECK-NEXT: [[T56:%.*]] = xor i8 [[T55]], [[T53]] +; CHECK-NEXT: store i8 [[T56]], i8* [[T54]], align 1 +; CHECK-NEXT: [[T57:%.*]] = load i8, i8* [[T12]], align 1 +; CHECK-NEXT: [[T58:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 8 +; CHECK-NEXT: [[T59:%.*]] = load i8, i8* [[T58]], align 1 +; CHECK-NEXT: [[T60:%.*]] = xor i8 [[T59]], [[T57]] +; CHECK-NEXT: store i8 [[T60]], i8* [[T58]], align 1 +; CHECK-NEXT: [[T61:%.*]] = load i8, i8* [[T13]], align 1 +; CHECK-NEXT: [[T62:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 9 +; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1 +; CHECK-NEXT: [[T64:%.*]] = xor i8 [[T63]], [[T61]] +; CHECK-NEXT: store i8 [[T64]], i8* [[T62]], align 1 +; CHECK-NEXT: [[T65:%.*]] = load i8, i8* [[T14]], align 1 +; CHECK-NEXT: [[T66:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 10 +; CHECK-NEXT: [[T67:%.*]] = load i8, i8* [[T66]], align 1 +; CHECK-NEXT: [[T68:%.*]] = xor i8 [[T67]], [[T65]] +; CHECK-NEXT: store i8 [[T68]], i8* [[T66]], align 1 +; CHECK-NEXT: [[T69:%.*]] = load i8, i8* [[T15]], align 1 +; CHECK-NEXT: [[T70:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 11 +; CHECK-NEXT: [[T71:%.*]] = load i8, i8* [[T70]], align 1 +; CHECK-NEXT: [[T72:%.*]] = xor i8 [[T71]], [[T69]] +; CHECK-NEXT: store i8 [[T72]], i8* [[T70]], align 1 +; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T16]], align 1 +; CHECK-NEXT: [[T74:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 12 +; CHECK-NEXT: [[T75:%.*]] = load i8, i8* [[T74]], align 1 +; CHECK-NEXT: [[T76:%.*]] = xor i8 [[T75]], [[T73]] +; CHECK-NEXT: store i8 [[T76]], i8* [[T74]], align 1 +; CHECK-NEXT: [[T77:%.*]] = load i8, i8* [[T17]], align 1 +; CHECK-NEXT: [[T78:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 13 +; CHECK-NEXT: [[T79:%.*]] = load i8, i8* [[T78]], align 1 +; CHECK-NEXT: [[T80:%.*]] = xor i8 [[T79]], [[T77]] +; CHECK-NEXT: store i8 [[T80]], i8* [[T78]], align 1 +; CHECK-NEXT: [[T81:%.*]] = load i8, i8* [[T18]], align 1 +; CHECK-NEXT: [[T82:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 14 +; CHECK-NEXT: [[T83:%.*]] = load i8, i8* [[T82]], align 1 +; CHECK-NEXT: [[T84:%.*]] = xor i8 [[T83]], [[T81]] +; CHECK-NEXT: store i8 [[T84]], i8* [[T82]], align 1 +; CHECK-NEXT: [[T85:%.*]] = load i8, i8* [[T21]], align 1 +; CHECK-NEXT: [[T86:%.*]] = getelementptr inbounds i8, i8* [[ARG2]], i64 15 +; CHECK-NEXT: [[T87:%.*]] = load i8, i8* [[T86]], align 1 +; CHECK-NEXT: [[T88:%.*]] = xor i8 [[T87]], [[T85]] +; CHECK-NEXT: store i8 [[T88]], i8* [[T86]], align 1 +; CHECK-NEXT: ret i32 1 +; +bb: + %t = alloca <16 x i8>, align 16 + %t4 = getelementptr inbounds <16 x i8>, <16 x i8>* %t, i64 0, i64 0 + %t5 = getelementptr inbounds i8, i8* %arg3, i64 1 + %t6 = getelementptr inbounds i8, i8* %arg3, i64 2 + %t7 = getelementptr inbounds i8, i8* %arg3, i64 3 + %t8 = getelementptr inbounds 
i8, i8* %arg3, i64 4 + %t9 = getelementptr inbounds i8, i8* %arg3, i64 5 + %t10 = getelementptr inbounds i8, i8* %arg3, i64 6 + %t11 = getelementptr inbounds i8, i8* %arg3, i64 7 + %t12 = getelementptr inbounds i8, i8* %arg3, i64 8 + %t13 = getelementptr inbounds i8, i8* %arg3, i64 9 + %t14 = getelementptr inbounds i8, i8* %arg3, i64 10 + %t15 = getelementptr inbounds i8, i8* %arg3, i64 11 + %t16 = getelementptr inbounds i8, i8* %arg3, i64 12 + %t17 = getelementptr inbounds i8, i8* %arg3, i64 13 + %t18 = getelementptr inbounds i8, i8* %arg3, i64 14 + %t19 = bitcast i8* %arg1 to <16 x i8>* + %A.gep.0 = getelementptr i8, i8* %A, i64 0 + %A.0 = load i8, i8* %A.gep.0 + %B.gep.0 = getelementptr i8, i8* %B, i64 0 + %B.0 = load i8, i8* %B.gep.0 + %xor.0 = xor i8 %A.0, %B.0 + %A.gep.1 = getelementptr i8, i8* %A, i64 1 + %A.1 = load i8, i8* %A.gep.1 + %B.gep.1 = getelementptr i8, i8* %B, i64 1 + %B.1 = load i8, i8* %B.gep.1 + %xor.1 = xor i8 %A.1, %B.1 + %A.gep.2 = getelementptr i8, i8* %A, i64 2 + %A.2 = load i8, i8* %A.gep.2 + %B.gep.2 = getelementptr i8, i8* %B, i64 2 + %B.2 = load i8, i8* %B.gep.2 + %xor.2 = xor i8 %A.2, %B.2 + %A.gep.3 = getelementptr i8, i8* %A, i64 3 + %A.3 = load i8, i8* %A.gep.3 + %B.gep.3 = getelementptr i8, i8* %B, i64 3 + %B.3 = load i8, i8* %B.gep.3 + %xor.3 = xor i8 %A.3, %B.3 + %A.gep.4 = getelementptr i8, i8* %A, i64 4 + %A.4 = load i8, i8* %A.gep.4 + %B.gep.4 = getelementptr i8, i8* %B, i64 4 + %B.4 = load i8, i8* %B.gep.4 + %xor.4 = xor i8 %A.4, %B.4 + %A.gep.5 = getelementptr i8, i8* %A, i64 5 + %A.5 = load i8, i8* %A.gep.5 + %B.gep.5 = getelementptr i8, i8* %B, i64 5 + %B.5 = load i8, i8* %B.gep.5 + %xor.5 = xor i8 %A.5, %B.5 + %A.gep.6 = getelementptr i8, i8* %A, i64 6 + %A.6 = load i8, i8* %A.gep.6 + %B.gep.6 = getelementptr i8, i8* %B, i64 6 + %B.6 = load i8, i8* %B.gep.6 + %xor.6 = xor i8 %A.6, %B.6 + %A.gep.7 = getelementptr i8, i8* %A, i64 7 + %A.7 = load i8, i8* %A.gep.7 + %B.gep.7 = getelementptr i8, i8* %B, i64 7 + %B.7 = load i8, i8* %B.gep.7 + %xor.7 = xor i8 %A.7, %B.7 + %A.gep.8 = getelementptr i8, i8* %A, i64 8 + %A.8 = load i8, i8* %A.gep.8 + %B.gep.8 = getelementptr i8, i8* %B, i64 8 + %B.8 = load i8, i8* %B.gep.8 + %xor.8 = xor i8 %A.8, %B.8 + %A.gep.9 = getelementptr i8, i8* %A, i64 9 + %A.9 = load i8, i8* %A.gep.9 + %B.gep.9 = getelementptr i8, i8* %B, i64 9 + %B.9 = load i8, i8* %B.gep.9 + %xor.9 = xor i8 %A.9, %B.9 + %A.gep.10 = getelementptr i8, i8* %A, i64 10 + %A.10 = load i8, i8* %A.gep.10 + %B.gep.10 = getelementptr i8, i8* %B, i64 10 + %B.10 = load i8, i8* %B.gep.10 + %xor.10 = xor i8 %A.10, %B.10 + %A.gep.11 = getelementptr i8, i8* %A, i64 11 + %A.11 = load i8, i8* %A.gep.11 + %B.gep.11 = getelementptr i8, i8* %B, i64 11 + %B.11 = load i8, i8* %B.gep.11 + %xor.11 = xor i8 %A.11, %B.11 + %A.gep.12 = getelementptr i8, i8* %A, i64 12 + %A.12 = load i8, i8* %A.gep.12 + %B.gep.12 = getelementptr i8, i8* %B, i64 12 + %B.12 = load i8, i8* %B.gep.12 + %xor.12 = xor i8 %A.12, %B.12 + %A.gep.13 = getelementptr i8, i8* %A, i64 13 + %A.13 = load i8, i8* %A.gep.13 + %B.gep.13 = getelementptr i8, i8* %B, i64 13 + %B.13 = load i8, i8* %B.gep.13 + %xor.13 = xor i8 %A.13, %B.13 + %A.gep.14 = getelementptr i8, i8* %A, i64 14 + %A.14 = load i8, i8* %A.gep.14 + %B.gep.14 = getelementptr i8, i8* %B, i64 14 + %B.14 = load i8, i8* %B.gep.14 + %xor.14 = xor i8 %A.14, %B.14 + %A.gep.15 = getelementptr i8, i8* %A, i64 15 + %A.15 = load i8, i8* %A.gep.15 + %B.gep.15 = getelementptr i8, i8* %B, i64 15 + %B.15 = load i8, i8* %B.gep.15 + %xor.15 = xor i8 %A.15, 
%B.15 + %R.gep.0 = getelementptr i8, i8* %arg1, i64 0 + store i8 %xor.0, i8* %R.gep.0 + %R.gep.1 = getelementptr i8, i8* %arg1, i64 1 + store i8 %xor.1, i8* %R.gep.1 + %R.gep.2 = getelementptr i8, i8* %arg1, i64 2 + store i8 %xor.2, i8* %R.gep.2 + %R.gep.3 = getelementptr i8, i8* %arg1, i64 3 + store i8 %xor.3, i8* %R.gep.3 + %R.gep.4 = getelementptr i8, i8* %arg1, i64 4 + store i8 %xor.4, i8* %R.gep.4 + %R.gep.5 = getelementptr i8, i8* %arg1, i64 5 + store i8 %xor.5, i8* %R.gep.5 + %R.gep.6 = getelementptr i8, i8* %arg1, i64 6 + store i8 %xor.6, i8* %R.gep.6 + %R.gep.7 = getelementptr i8, i8* %arg1, i64 7 + store i8 %xor.7, i8* %R.gep.7 + %R.gep.8 = getelementptr i8, i8* %arg1, i64 8 + store i8 %xor.8, i8* %R.gep.8 + %R.gep.9 = getelementptr i8, i8* %arg1, i64 9 + store i8 %xor.9, i8* %R.gep.9 + %R.gep.10 = getelementptr i8, i8* %arg1, i64 10 + store i8 %xor.10, i8* %R.gep.10 + %R.gep.11 = getelementptr i8, i8* %arg1, i64 11 + store i8 %xor.11, i8* %R.gep.11 + %R.gep.12 = getelementptr i8, i8* %arg1, i64 12 + store i8 %xor.12, i8* %R.gep.12 + %R.gep.13 = getelementptr i8, i8* %arg1, i64 13 + store i8 %xor.13, i8* %R.gep.13 + %R.gep.14 = getelementptr i8, i8* %arg1, i64 14 + store i8 %xor.14, i8* %R.gep.14 + %R.gep.15 = getelementptr i8, i8* %arg1, i64 15 + store i8 %xor.15, i8* %R.gep.15 + + + %t21 = getelementptr inbounds i8, i8* %arg3, i64 15 + %t22 = bitcast i8* %arg3 to <16 x i8>* + + call void @foo(i8* nonnull %t4) + %t26 = load i8, i8* %arg3, align 1 + %t27 = load i8, i8* %arg2, align 1 + %t28 = xor i8 %t27, %t26 + store i8 %t28, i8* %arg2, align 1 + %t29 = load i8, i8* %t5, align 1 + %t30 = getelementptr inbounds i8, i8* %arg2, i64 1 + %t31 = load i8, i8* %t30, align 1 + %t32 = xor i8 %t31, %t29 + store i8 %t32, i8* %t30, align 1 + %t33 = load i8, i8* %t6, align 1 + %t34 = getelementptr inbounds i8, i8* %arg2, i64 2 + %t35 = load i8, i8* %t34, align 1 + %t36 = xor i8 %t35, %t33 + store i8 %t36, i8* %t34, align 1 + %t37 = load i8, i8* %t7, align 1 + %t38 = getelementptr inbounds i8, i8* %arg2, i64 3 + %t39 = load i8, i8* %t38, align 1 + %t40 = xor i8 %t39, %t37 + store i8 %t40, i8* %t38, align 1 + %t41 = load i8, i8* %t8, align 1 + %t42 = getelementptr inbounds i8, i8* %arg2, i64 4 + %t43 = load i8, i8* %t42, align 1 + %t44 = xor i8 %t43, %t41 + store i8 %t44, i8* %t42, align 1 + %t45 = load i8, i8* %t9, align 1 + %t46 = getelementptr inbounds i8, i8* %arg2, i64 5 + %t47 = load i8, i8* %t46, align 1 + %t48 = xor i8 %t47, %t45 + store i8 %t48, i8* %t46, align 1 + %t49 = load i8, i8* %t10, align 1 + %t50 = getelementptr inbounds i8, i8* %arg2, i64 6 + %t51 = load i8, i8* %t50, align 1 + %t52 = xor i8 %t51, %t49 + store i8 %t52, i8* %t50, align 1 + %t53 = load i8, i8* %t11, align 1 + %t54 = getelementptr inbounds i8, i8* %arg2, i64 7 + %t55 = load i8, i8* %t54, align 1 + %t56 = xor i8 %t55, %t53 + store i8 %t56, i8* %t54, align 1 + %t57 = load i8, i8* %t12, align 1 + %t58 = getelementptr inbounds i8, i8* %arg2, i64 8 + %t59 = load i8, i8* %t58, align 1 + %t60 = xor i8 %t59, %t57 + store i8 %t60, i8* %t58, align 1 + %t61 = load i8, i8* %t13, align 1 + %t62 = getelementptr inbounds i8, i8* %arg2, i64 9 + %t63 = load i8, i8* %t62, align 1 + %t64 = xor i8 %t63, %t61 + store i8 %t64, i8* %t62, align 1 + %t65 = load i8, i8* %t14, align 1 + %t66 = getelementptr inbounds i8, i8* %arg2, i64 10 + %t67 = load i8, i8* %t66, align 1 + %t68 = xor i8 %t67, %t65 + store i8 %t68, i8* %t66, align 1 + %t69 = load i8, i8* %t15, align 1 + %t70 = getelementptr inbounds i8, i8* %arg2, i64 11 + %t71 = load 
i8, i8* %t70, align 1 + %t72 = xor i8 %t71, %t69 + store i8 %t72, i8* %t70, align 1 + %t73 = load i8, i8* %t16, align 1 + %t74 = getelementptr inbounds i8, i8* %arg2, i64 12 + %t75 = load i8, i8* %t74, align 1 + %t76 = xor i8 %t75, %t73 + store i8 %t76, i8* %t74, align 1 + %t77 = load i8, i8* %t17, align 1 + %t78 = getelementptr inbounds i8, i8* %arg2, i64 13 + %t79 = load i8, i8* %t78, align 1 + %t80 = xor i8 %t79, %t77 + store i8 %t80, i8* %t78, align 1 + %t81 = load i8, i8* %t18, align 1 + %t82 = getelementptr inbounds i8, i8* %arg2, i64 14 + %t83 = load i8, i8* %t82, align 1 + %t84 = xor i8 %t83, %t81 + store i8 %t84, i8* %t82, align 1 + %t85 = load i8, i8* %t21, align 1 + %t86 = getelementptr inbounds i8, i8* %arg2, i64 15 + %t87 = load i8, i8* %t86, align 1 + %t88 = xor i8 %t87, %t85 + store i8 %t88, i8* %t86, align 1 + ret i32 1 +} + +; A test case where instructions required to compute the pointer bounds get +; vectorized before versioning. Make sure there is no crash. +define void @crash_instructions_deleted(float* %t, i32* %a, i32** noalias %ptr) { +; CHECK-LABEL: @crash_instructions_deleted( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[T42:%.*]] = bitcast float* [[T:%.*]] to i8* +; CHECK-NEXT: [[T15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 2 +; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[T15]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 8 +; CHECK-NEXT: [[T17:%.*]] = load i32*, i32** [[PTR:%.*]], align 8 +; CHECK-NEXT: br label [[BB18:%.*]] +; CHECK: bb18: +; CHECK-NEXT: [[T19:%.*]] = sext i32 0 to i64 +; CHECK-NEXT: [[T20:%.*]] = add nsw i64 1, [[T19]] +; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T17]], i64 [[T20]] +; CHECK-NEXT: [[T22:%.*]] = bitcast i32* [[T21]] to i8* +; CHECK-NEXT: [[T23:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 1 +; CHECK-NEXT: [[T24:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 2 +; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds i8, i8* [[T22]], i64 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[T17]], i64 2 +; CHECK-NEXT: [[SCEVGEP18:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr float, float* [[T]], i64 4 +; CHECK-NEXT: [[SCEVGEP4344:%.*]] = bitcast float* [[SCEVGEP43]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[T22]], [[SCEVGEP4344]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[T42]], [[SCEVGEP18]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[BB18_SCALAR:%.*]], label [[BB18_SLPVERSIONED1:%.*]] +; CHECK: bb18.scalar: +; CHECK-NEXT: [[T26:%.*]] = load i8, i8* [[T22]], align 1 +; CHECK-NEXT: [[T27:%.*]] = uitofp i8 [[T26]] to float +; CHECK-NEXT: [[T28:%.*]] = fdiv float [[T27]], 2.550000e+02 +; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds float, float* [[T]], i64 0 +; CHECK-NEXT: store float [[T28]], float* [[T29]], align 8 +; CHECK-NEXT: [[T30:%.*]] = load i8, i8* [[T23]], align 1 +; CHECK-NEXT: [[T31:%.*]] = uitofp i8 [[T30]] to float +; CHECK-NEXT: [[T32:%.*]] = fdiv float [[T31]], 2.550000e+02 +; CHECK-NEXT: [[T33:%.*]] = getelementptr inbounds float, float* [[T]], i64 1 +; CHECK-NEXT: store float [[T32]], float* [[T33]], align 4 +; CHECK-NEXT: [[T34:%.*]] = load i8, i8* [[T24]], align 1 +; CHECK-NEXT: [[T35:%.*]] = uitofp i8 [[T34]] to float +; CHECK-NEXT: [[T36:%.*]] = fdiv float 
[[T35]], 2.550000e+02 +; CHECK-NEXT: [[T37:%.*]] = getelementptr inbounds float, float* [[T]], i64 2 +; CHECK-NEXT: store float [[T36]], float* [[T37]], align 8 +; CHECK-NEXT: [[T38:%.*]] = load i8, i8* [[T25]], align 1 +; CHECK-NEXT: [[T39:%.*]] = uitofp i8 [[T38]] to float +; CHECK-NEXT: [[T40:%.*]] = fdiv float [[T39]], 2.550000e+02 +; CHECK-NEXT: [[T41:%.*]] = getelementptr inbounds float, float* [[T]], i64 3 +; CHECK-NEXT: store float [[T40]], float* [[T41]], align 4 +; CHECK-NEXT: br label [[BB18_MERGE:%.*]] +; CHECK: bb18.merge: +; CHECK-NEXT: ret void +; CHECK: bb18.slpversioned1: +; CHECK-NEXT: [[T295:%.*]] = getelementptr inbounds float, float* [[T]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[T22]] to <4 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1, !alias.scope !35, !noalias !38 +; CHECK-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float> +; CHECK-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[T295]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 8, !alias.scope !38, !noalias !35 +; CHECK-NEXT: br label [[BB18_MERGE]] +; +bb: + %t6 = icmp slt i32 10, 0 + %t7 = icmp sgt i32 20, 20 + %t9 = select i1 %t7, i32 5, i32 0 + %t10 = select i1 %t6, i32 0, i32 %t9 + %t11 = icmp slt i32 10, 0 + %t12 = icmp sgt i32 20, 20 + %t13 = select i1 %t12, i32 5, i32 10 + %t14 = select i1 %t11, i32 0, i32 %t13 + %t15 = getelementptr inbounds i32, i32* %a, i32 2 + store i32 %t10, i32* %t15, align 8 + %t16 = getelementptr inbounds i32, i32* %a, i32 3 + store i32 %t14, i32* %t16, align 4 + %t17 = load i32*, i32** %ptr, align 8 + br label %bb18 + +bb18: ; preds = %bb5 + %t19 = sext i32 %t10 to i64 + %t20 = add nsw i64 1, %t19 + %t21 = getelementptr inbounds i32, i32* %t17, i64 %t20 + %t22 = bitcast i32* %t21 to i8* + %t23 = getelementptr inbounds i8, i8* %t22, i64 1 + %t24 = getelementptr inbounds i8, i8* %t22, i64 2 + %t25 = getelementptr inbounds i8, i8* %t22, i64 3 + %t26 = load i8, i8* %t22, align 1 + %t27 = uitofp i8 %t26 to float + %t28 = fdiv float %t27, 2.550000e+02 + %t29 = getelementptr inbounds float, float* %t, i64 0 + store float %t28, float* %t29, align 8 + %t30 = load i8, i8* %t23, align 1 + %t31 = uitofp i8 %t30 to float + %t32 = fdiv float %t31, 2.550000e+02 + %t33 = getelementptr inbounds float, float* %t, i64 1 + store float %t32, float* %t33, align 4 + %t34 = load i8, i8* %t24, align 1 + %t35 = uitofp i8 %t34 to float + %t36 = fdiv float %t35, 2.550000e+02 + %t37 = getelementptr inbounds float, float* %t, i64 2 + store float %t36, float* %t37, align 8 + %t38 = load i8, i8* %t25, align 1 + %t39 = uitofp i8 %t38 to float + %t40 = fdiv float %t39, 2.550000e+02 + %t41 = getelementptr inbounds float, float* %t, i64 3 + store float %t40, float* %t41, align 4 + ret void +} + +; A test case where there are no instructions accessing a tracked object in a +; block for which versioning was requested. 
+define void @crash_no_tracked_instructions(float** %arg, float* %arg.2, float* %arg.3, i1 %c) { +; CHECK-LABEL: @crash_no_tracked_instructions( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T19:%.*]] = load float*, float** [[ARG:%.*]], align 8 +; CHECK-NEXT: [[T20:%.*]] = load float, float* [[ARG_3:%.*]], align 4 +; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds float, float* [[ARG_2:%.*]], i64 0 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]] +; CHECK: bb22: +; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01 +; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01 +; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, float* [[T19]], i64 2 +; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01 +; CHECK-NEXT: store float [[T26]], float* [[T25]], align 4 +; CHECK-NEXT: [[T27:%.*]] = load float, float* [[T21]], align 8 +; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01 +; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01 +; CHECK-NEXT: br label [[BB30]] +; CHECK: bb30: +; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ] +; CHECK-NEXT: br label [[BB36:%.*]] +; CHECK: bb36: +; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00 +; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 0 +; CHECK-NEXT: store float [[T37]], float* [[T38]], align 4 +; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00 +; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 1 +; CHECK-NEXT: store float [[T39]], float* [[T40]], align 4 +; CHECK-NEXT: br label [[BB41:%.*]] +; CHECK: bb41: +; CHECK-NEXT: ret void +; +entry: + %t19 = load float*, float** %arg + %t20 = load float, float* %arg.3, align 4 + %t21 = getelementptr inbounds float, float* %arg.2, i64 0 + br i1 %c, label %bb22, label %bb30 + +bb22: + %t23 = fmul float %t20, 99.0 + %t24 = fmul float %t23, 99.0 + %t25 = getelementptr inbounds float, float* %t19, i64 2 + %t26 = fmul float %t23, 10.0 + store float %t26, float* %t25, align 4 + %t27 = load float, float* %t21, align 8 + %t28 = fadd float %t24, 20.0 + %t29 = fadd float %t26, 20.0 + br label %bb30 + +bb30: + %t31 = phi float [ %t28, %bb22 ], [ 0.0, %entry ] + %t32 = phi float [ %t29, %bb22 ], [ %t20, %entry ] + br label %bb36 + +bb36: + %t37 = fmul float %t31, 3.0 + %t38 = getelementptr inbounds float, float* %arg.3, i64 0 + store float %t37, float* %t38, align 4 + %t39 = fmul float %t32, 3.0 + %t40 = getelementptr inbounds float, float* %arg.3, i64 1 + store float %t39, float* %t40, align 4 + br label %bb41 + +bb41: ret void } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -1,11 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: memcheck 
define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 +; CHECK-NEXT: [[COUNTER12:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[OUT_BLOCK14:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 4 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 4 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[COUNTER12]], [[SCEVGEP1516]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OUT_BLOCK14]], [[SCEVGEP13]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED1:%.*]] +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 @@ -26,7 +41,18 @@ ; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 ; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.slpversioned1: +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4