diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -55,6 +55,14 @@
 } // end namespace slpvectorizer
 
+struct SLPVectorizerResult {
+  bool MadeAnyChange;
+  bool MadeCFGChange;
+
+  SLPVectorizerResult(bool MadeAnyChange, bool MadeCFGChange)
+      : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {}
+};
+
 struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   using StoreList = SmallVector<StoreInst *, 8>;
   using StoreListMap = MapVector<Value *, StoreList>;
@@ -75,10 +83,12 @@
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
   // Glue for old PM.
-  bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
-               TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_,
-               DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
-               OptimizationRemarkEmitter *ORE_);
+  SLPVectorizerResult runImpl(Function &F, ScalarEvolution *SE_,
+                              TargetTransformInfo *TTI_,
+                              TargetLibraryInfo *TLI_, AAResults *AA_,
+                              LoopInfo *LI_, DominatorTree *DT_,
+                              AssumptionCache *AC_, DemandedBits *DB_,
+                              OptimizationRemarkEmitter *ORE_);
 
 private:
   /// Collect store and getelementptr instructions and organize them
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -62,6 +63,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
@@ -85,8 +87,11 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
@@ -107,6 +112,10 @@
 #define DEBUG_TYPE "SLP"
 
 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+STATISTIC(NumVersioningSuccessful,
+          "Number of times versioning was tried and beneficial");
+STATISTIC(NumVersioningFailed,
+          "Number of times versioning was tried but was not beneficial");
 
 cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                   cl::desc("Run the SLP vectorization passes"));
@@ -175,6 +184,10 @@
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
+static cl::opt<bool> EnableMemoryVersioning(
+    "slp-memory-versioning", cl::init(false), cl::Hidden,
+    cl::desc("Enable memory versioning for SLP vectorization."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -581,6 +594,44 @@
   return Index;
 }
 
+// Try to add or extend the runtime pointer checking group for \p I, if it is a
+// memory access.
+static bool
+extendMemBounds(Instruction &I, bool Insert, ScalarEvolution &SE,
+                MapVector<Value *, RuntimeCheckingPtrGroup> &MemBounds) {
+
+  BasicBlock *BB = I.getParent();
+  auto GetPtr = [](Instruction *I) -> Value * {
+    if (auto *L = dyn_cast<LoadInst>(I))
+      return L->getPointerOperand();
+    if (auto *S = dyn_cast<StoreInst>(I))
+      return S->getPointerOperand();
+    return nullptr;
+  };
+  auto *Ptr = GetPtr(&I);
+  if (!Ptr)
+    return false;
+  auto *Start = SE.getSCEV(Ptr);
+
+  Value *Obj = getUnderlyingObject(Ptr);
+  if (!Obj)
+    return false;
+
+  if (!SE.properlyDominates(Start, BB))
+    return false;
+
+  unsigned AS = Ptr->getType()->getPointerAddressSpace();
+  // Runtime checks are generated to ensure this property holds.
+  auto *End = SE.getAddExpr(Start, SE.getOne(Ptr->getType()), SCEV::FlagNUW);
+  if (Insert)
+    MemBounds.insert({Obj, {0, Start, End, AS}});
+  auto BoundsIter = MemBounds.find(Obj);
+  if (BoundsIter == MemBounds.end())
+    return false;
+
+  return BoundsIter->second.addPointer(0, Start, End, AS, SE);
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -589,6 +640,16 @@
   struct ScheduleData;
 
 public:
+  // Map of objects to start & end pointers we need to generate runtime checks
+  // for.
+  MapVector<Value *, RuntimeCheckingPtrGroup> MemBounds;
+  /// Cache for alias results.
+  /// TODO: consider moving this to the AliasAnalysis itself.
+  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+  bool CollectMemAccess = false;
+
   using ValueList = SmallVector<Value *, 8>;
   using InstrList = SmallVector<Instruction *, 16>;
   using ValueSet = SmallPtrSet<Value *, 16>;
@@ -667,6 +728,7 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    MemBounds.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -674,6 +736,25 @@
   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
+  /// Remove instructions in DeletedInstructions.
+  void removeDeletedInstructions() {
+    for (const auto &Pair : DeletedInstructions) {
+      // Replace operands of ignored instructions with Undefs in case if they
+      // were marked for deletion.
+      if (Pair.getSecond()) {
+        Value *Undef = UndefValue::get(Pair.getFirst()->getType());
+        Pair.getFirst()->replaceAllUsesWith(Undef);
+      }
+      Pair.getFirst()->dropAllReferences();
+    }
+    for (const auto &Pair : DeletedInstructions) {
+      assert(Pair.getFirst()->use_empty() &&
+             "trying to erase instruction with users.");
+      Pair.getFirst()->eraseFromParent();
+    }
+    DeletedInstructions.clear();
+  }
+
   /// \returns The best order of instructions for vectorization.
   Optional<ArrayRef<unsigned>> bestOrder() const {
     assert(llvm::all_of(
@@ -1976,11 +2057,6 @@
     return aliased;
   }
 
-  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
-
-  /// Cache for alias results.
-  /// TODO: consider moving this to the AliasAnalysis itself.
-  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
 
   /// Removes an instruction from its block and eventually deletes it.
   /// It's like Instruction::eraseFromParent() except that the actual deletion
@@ -2565,27 +2641,7 @@
 } // end namespace llvm
 
-BoUpSLP::~BoUpSLP() {
-  for (const auto &Pair : DeletedInstructions) {
-    // Replace operands of ignored instructions with Undefs in case if they were
-    // marked for deletion.
-    if (Pair.getSecond()) {
-      Value *Undef = UndefValue::get(Pair.getFirst()->getType());
-      Pair.getFirst()->replaceAllUsesWith(Undef);
-    }
-    Pair.getFirst()->dropAllReferences();
-  }
-  for (const auto &Pair : DeletedInstructions) {
-    assert(Pair.getFirst()->use_empty() &&
-           "trying to erase instruction with users.");
-    Pair.getFirst()->eraseFromParent();
-  }
-#ifdef EXPENSIVE_CHECKS
-  // If we could guarantee that this call is not extremely slow, we could
-  // remove the ifdef limitation (see PR47712).
-  assert(!verifyFunction(*F, &dbgs()));
-#endif
-}
+BoUpSLP::~BoUpSLP() { removeDeletedInstructions(); }
 
 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
   for (auto *V : AV) {
@@ -6213,6 +6269,7 @@
           while (DepDest) {
             assert(isInSchedulingRegion(DepDest));
 
+            ScheduleData *DestBundle = DepDest->FirstInBundle;
             // We have two limits to reduce the complexity:
             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
             //    SLP->isAliased (which is the expensive part in this loop).
@@ -6230,9 +6287,41 @@
              //    balance between reduced runtime and accurate dependencies.
              numAliased++;
 
+              // If this bundle is not scheduled and no versioned code has been
+              // generated yet, try to collect the bounds of the accesses to
+              // generate runtime checks.
+              if (!DestBundle->IsScheduled && SLP->CollectMemAccess) {
+                // FIXME Naming
+                auto GetPtr = [](Instruction *I) -> Value * {
+                  if (auto *L = dyn_cast<LoadInst>(I))
+                    return L->getPointerOperand();
+                  if (auto *S = dyn_cast<StoreInst>(I))
+                    return S->getPointerOperand();
+                  return nullptr;
+                };
+                auto *Src = GetPtr(SrcInst);
+                auto *Dst = GetPtr(DepDest->Inst);
+
+                if (SrcInst->getParent() == DepDest->Inst->getParent() && Src &&
+                    Dst) {
+                  auto GetPtr = [](Instruction *I) -> Value * {
+                    if (auto *L = dyn_cast<LoadInst>(I))
+                      return getUnderlyingObject(L->getPointerOperand());
+                    if (auto *S = dyn_cast<StoreInst>(I))
+                      return getUnderlyingObject(S->getPointerOperand());
+                    return nullptr;
+                  };
+                  bool AddedSrc =
+                      extendMemBounds(*SrcInst, true, *SLP->SE, SLP->MemBounds);
+                  bool AddedDst = extendMemBounds(*DepDest->Inst, true,
+                                                  *SLP->SE, SLP->MemBounds);
+                  if (!AddedSrc || !AddedDst ||
+                      GetPtr(SrcInst) == GetPtr(DepDest->Inst))
+                    SLP->MemBounds.clear();
+                }
+              }
              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
-              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
@@ -6672,7 +6761,7 @@
     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
 
-    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE).MadeAnyChange;
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -6688,9 +6777,7 @@
     AU.addRequired<InjectTLIMappingsLegacy>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
-    AU.setPreservesCFG();
   }
 };
 
@@ -6707,23 +6794,24 @@
   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
-  if (!Changed)
+  auto Result = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+  if (!Result.MadeAnyChange)
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
+  if (!Result.MadeCFGChange)
+    PA.preserveSet<CFGAnalyses>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
   return PA;
 }
 
-bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
-                                TargetTransformInfo *TTI_,
-                                TargetLibraryInfo *TLI_, AAResults *AA_,
-                                LoopInfo *LI_, DominatorTree *DT_,
-                                AssumptionCache *AC_, DemandedBits *DB_,
-                                OptimizationRemarkEmitter *ORE_) {
+SLPVectorizerResult SLPVectorizerPass::runImpl(
+    Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
+    TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_,
+    AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_) {
   if (!RunSLPVectorization)
-    return false;
+    return {false, false};
   SE = SE_;
   TTI = TTI_;
   TLI = TLI_;
@@ -6737,15 +6825,16 @@
   Stores.clear();
   GEPs.clear();
   bool Changed = false;
+  bool CFGChanged = false;
 
   // If the target claims to have no vector registers don't attempt
   // vectorization.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
-    return false;
+    return {false, false};
 
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
-    return false;
+    return {false, false};
 
   LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
 
@@ -6759,6 +6848,8 @@
   // Update DFS numbers now so that we can use them for ordering.
   DT->updateDFSNumbers();
 
+  SmallVector<BasicBlock *> BlocksToRetry;
+  SmallVector<MapVector<Value *, RuntimeCheckingPtrGroup>, 4> BoundsToUse;
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
@@ -6767,7 +6858,43 @@
     if (!Stores.empty()) {
       LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                         << " underlying objects.\n");
-      Changed |= vectorizeStoreChains(R);
+      R.MemBounds.clear();
+
+      auto NoOrSingleSucc = [](BasicBlock *BB) {
+        return succ_begin(BB) == succ_end(BB) ||
+               std::next(succ_begin(BB)) == succ_end(BB);
+      };
+      auto NoOrSinglePred = [](BasicBlock *BB) {
+        return pred_begin(BB) == pred_end(BB) ||
+               std::next(pred_begin(BB)) == pred_end(BB);
+      };
+
+      auto AllUsesInside = [](BasicBlock *BB) {
+        return all_of(*BB, [BB](Instruction &I) {
+          return all_of(I.users(), [BB](User *U) {
+            return cast<Instruction>(U)->getParent() == BB;
+          });
+        });
+      };
+      auto TermSupported = [](BasicBlock *BB) {
+        auto *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
+        return isa<BranchInst>(BB->getTerminator()) ||
+               (RetI && !RetI->getReturnValue());
+      };
+
+      if (EnableMemoryVersioning)
+        R.CollectMemAccess = BB->size() <= 300 && NoOrSingleSucc(BB) &&
+                             NoOrSinglePred(BB) && AllUsesInside(BB) &&
+                             TermSupported(BB);
+
+      bool VectorizedChains = vectorizeStoreChains(R);
+      if (!VectorizedChains && !R.MemBounds.empty()) {
+        BlocksToRetry.push_back(BB);
+        BoundsToUse.push_back(R.MemBounds);
+      }
+      R.CollectMemAccess = false;
+      R.MemBounds.clear();
+      Changed |= VectorizedChains;
     }
 
     // Vectorize trees that end at reductions.
@@ -6783,11 +6910,170 @@
     }
   }
 
-  if (Changed) {
+  R.AliasCache.clear();
+  for (unsigned I = 0; I != BlocksToRetry.size(); I++) {
+    // First, clean up deleted instructions, so they are not re-used during
+    // SCEV expansion.
+    R.removeDeletedInstructions();
+    BasicBlock *BB = BlocksToRetry[I];
+    auto &MemBounds = BoundsToUse[I];
+
+    SmallVector<RuntimePointerCheck> PointerChecks;
+    CFGChanged = true;
+    // Minimize/maximize the lower/upper bounds of accesses in the block to
+    // version.
+    for (Instruction &I : *BB)
+      extendMemBounds(I, false, *SE, MemBounds);
+
+    SmallVector<RuntimeCheckingPtrGroup *> BoundGroups;
+    for (auto &B : MemBounds)
+      BoundGroups.emplace_back(&B.second);
+
+    // Create a RuntimePointerCheck for all groups in BoundGroups.
+    for (unsigned I = 0, E = BoundGroups.size(); I != E; ++I)
+      for (unsigned J = I + 1; J != E; ++J)
+        PointerChecks.emplace_back(&*BoundGroups[I], &*BoundGroups[J]);
+
+    LLVMContext &Ctx = BB->getContext();
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    std::string OriginalName = BB->getName().str();
+    auto *CheckBlock = splitBlockBefore(BB, &*BB->getFirstNonPHI(), &DTU, LI,
+                                        nullptr, OriginalName + ".slpmemcheck");
+    auto *MergeBlock = BB;
+    BB = splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr,
+                          OriginalName + ".slpversioned");
+
+    ValueToValueMapTy VMap;
+    auto *Scalar = CloneBasicBlock(BB, VMap, "", BB->getParent());
+    Scalar->setName(OriginalName + ".scalar");
+    MergeBlock->setName(OriginalName + ".merge");
+    SmallVector<BasicBlock *> Tmp;
+    Tmp.push_back(Scalar);
+    remapInstructionsInBlocks(Tmp, VMap);
+
+    SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(),
+                     "memcheck");
+    auto *MemoryRuntimeCheck = addRuntimeChecks(CheckBlock->getTerminator(),
+                                                nullptr, PointerChecks, Exp)
+                                   .second;
+    assert(MemoryRuntimeCheck &&
+           "runtime checks required, but no checks generated in IR?");
+
+    IRBuilder<> ChkBuilder(CheckBlock->getTerminator());
+    Value *NoOverflowCheck = MemoryRuntimeCheck;
+    // Emit checks ensuring that computing the upper bound does not overflow.
+    for (auto &B : MemBounds) {
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, B.second.AddressSpace);
+      Value *Low = Exp.expandCodeFor(B.second.Low, PtrArithTy);
+      Value *High = Exp.expandCodeFor(B.second.High, PtrArithTy);
+      NoOverflowCheck = ChkBuilder.CreateAnd(
+          NoOverflowCheck, ChkBuilder.CreateICmpUGT(High, Low, "nowrap"),
+          "check");
+    }
+    ChkBuilder.CreateCondBr(NoOverflowCheck, Scalar, BB);
+    CheckBlock->getTerminator()->eraseFromParent();
+    DTU.applyUpdates({{DT->Insert, CheckBlock, Scalar}});
+    if (auto *L = LI->getLoopFor(CheckBlock))
+      L->addBasicBlockToLoop(Scalar, *LI);
+
+    Changed = true;
+
+    // Add !noalias metadata to memory accesses in the versioned block.
+    MDBuilder MDB(Ctx);
+    MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain");
+
+    DenseMap<const RuntimeCheckingPtrGroup *, MDNode *> GroupToScope;
+    for (const auto &Group : MemBounds)
+      GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain);
+
+    for (Instruction &I : *BB) {
+      auto GetPtr = [](Instruction *I) -> Value * {
+        if (auto *L = dyn_cast<LoadInst>(I))
+          return L->getPointerOperand();
+        if (auto *S = dyn_cast<StoreInst>(I))
+          return S->getPointerOperand();
+        return nullptr;
+      };
+      auto *Ptr = GetPtr(&I);
+      if (!Ptr)
+        continue;
+      auto *PtrSCEV = SE->getSCEV(Ptr);
+
+      Value *Obj = getUnderlyingObject(Ptr);
+      if (!Obj)
+        continue;
+
+      auto BoundsIter = MemBounds.find(Obj);
+      if (BoundsIter == MemBounds.end())
+        continue;
+      auto *LowerBound = BoundsIter->second.Low;
+      auto *UpperBound = BoundsIter->second.High;
+      auto *Scope = GroupToScope.find(&BoundsIter->second)->second;
+      if (SE->isKnownPredicate(CmpInst::ICMP_UGE, PtrSCEV, LowerBound) &&
+          SE->isKnownPredicate(CmpInst::ICMP_ULE, PtrSCEV, UpperBound)) {
+        I.setMetadata(
+            LLVMContext::MD_alias_scope,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope),
+                                MDNode::get(Ctx, Scope)));
+
+        SmallVector<Metadata *> NonAliasing;
+        for (auto &KV : GroupToScope) {
+          if (KV.first == &BoundsIter->second)
+            continue;
+          NonAliasing.push_back(KV.second);
+        }
+        I.setMetadata(
+            LLVMContext::MD_noalias,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias),
+                                MDNode::get(Ctx, NonAliasing)));
+      }
+    }
+
+    DTU.flush();
+    DT->updateDFSNumbers();
+    collectSeedInstructions(BB);
+
+    // Vectorize trees that end at stores.
+ assert(!Stores.empty() && "should have stores when versioning"); + LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() + << " underlying objects.\n"); + Changed |= vectorizeStoreChains(R); + + R.removeDeletedInstructions(); + InstructionCost ScalarCost = 0; + for (Instruction &I : *Scalar) + ScalarCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + InstructionCost SLPCost = 0; + for (Instruction &I : *CheckBlock) + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + for (Instruction &I : *BB) + SLPCost += TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + + if (SLPCost >= ScalarCost) { + Instruction *OldTerm = CheckBlock->getTerminator(); + OldTerm->eraseFromParent(); + IRBuilder<> Builder(CheckBlock); + + Builder.CreateBr(Scalar); + DTU.applyUpdates({{DT->Delete, CheckBlock, BB}}); + LI->removeBlock(BB); + DTU.deleteBB(BB); + DTU.applyUpdates({{DT->Delete, BB, MergeBlock}}); + MergeBlockIntoPredecessor(MergeBlock, &DTU, LI); + MergeBlockIntoPredecessor(Scalar, &DTU, LI); + NumVersioningFailed++; + } else { + NumVersioningSuccessful++; + } + DTU.flush(); + DT->updateDFSNumbers(); + } + + if (Changed && BlocksToRetry.empty()) { R.optimizeGatherSequence(); LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); } - return Changed; + return {Changed, CFGChanged}; } /// Order may have elements assigned special value (size) which is out of diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -252,7 +252,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll --- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -264,7 +264,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -257,7 +257,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- 
a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -238,7 +238,11 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -slp-memory-versioning -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" @@ -101,57 +101,67 @@ ; define void @f_alias(i8* nocapture %dst, i8* nocapture readonly %src, %struct.weight_t* nocapture readonly %w) { ; CHECK-LABEL: @f_alias( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16 -; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP2]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]] -; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ult i32 [[ADD]], 256 -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD]], 0 -; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]] -; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8 -; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1 -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP0]], [[CONV_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[TMP1]] -; CHECK-NEXT: [[TOBOOL_NOT_I_1:%.*]] = icmp ult i32 [[ADD_1]], 256 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_1]], 0 -; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[COND_I_1:%.*]] = select i1 [[TOBOOL_NOT_I_1]], i32 [[ADD_1]], i32 [[SHR_I_1]] -; CHECK-NEXT: [[CONV_I_1:%.*]] = trunc i32 [[COND_I_1]] to i8 -; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1 -; CHECK-NEXT: store i8 [[CONV_I_1]], i8* [[ARRAYIDX2_1]], align 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP0]], [[CONV_2]] -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[TMP1]] -; CHECK-NEXT: 
[[TOBOOL_NOT_I_2:%.*]] = icmp ult i32 [[ADD_2]], 256 -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_2]], 0 -; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP7]] to i32 -; CHECK-NEXT: [[COND_I_2:%.*]] = select i1 [[TOBOOL_NOT_I_2]], i32 [[ADD_2]], i32 [[SHR_I_2]] -; CHECK-NEXT: [[CONV_I_2:%.*]] = trunc i32 [[COND_I_2]] to i8 -; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 -; CHECK-NEXT: store i8 [[CONV_I_2]], i8* [[ARRAYIDX2_2]], align 1 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP0]], [[CONV_3]] -; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[MUL_3]], [[TMP1]] -; CHECK-NEXT: [[TOBOOL_NOT_I_3:%.*]] = icmp ult i32 [[ADD_3]], 256 -; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_3]], 0 -; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP9]] to i32 -; CHECK-NEXT: [[COND_I_3:%.*]] = select i1 [[TOBOOL_NOT_I_3]], i32 [[ADD_3]], i32 [[SHR_I_3]] -; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8 -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 -; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], align 1 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP38:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST]], [[SCEVGEP38]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[SCEVGEP]], [[DST]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP39:%.*]] = icmp ugt i8* [[SCEVGEP38]], [[SRC]] +; CHECK-NEXT: [[CHECK40:%.*]] = and i1 [[CHECK]], [[NOWRAP39]] +; CHECK-NEXT: [[SCALE2:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE2]], align 16 +; CHECK-NEXT: [[OFFSET3:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC]], align 1 +; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[TMP0]], [[CONV4]] +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[MUL5]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I7:%.*]] = icmp ult i32 [[ADD6]], 256 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD6]], 0 +; CHECK-NEXT: [[SHR_I8:%.*]] = sext i1 [[TMP3]] to i32 +; CHECK-NEXT: [[COND_I9:%.*]] = select i1 [[TOBOOL_NOT_I7]], i32 [[ADD6]], i32 [[SHR_I8]] +; CHECK-NEXT: [[CONV_I10:%.*]] = trunc i32 [[COND_I9]] to i8 +; CHECK-NEXT: store i8 [[CONV_I10]], i8* [[DST]], align 1 +; CHECK-NEXT: [[ARRAYIDX_111:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_111]], align 1 +; CHECK-NEXT: [[CONV_112:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[MUL_113:%.*]] = mul nsw i32 [[TMP0]], [[CONV_112]] +; CHECK-NEXT: [[ADD_114:%.*]] = add nsw i32 [[MUL_113]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I_115:%.*]] = icmp ult i32 [[ADD_114]], 256 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_114]], 0 +; CHECK-NEXT: [[SHR_I_116:%.*]] = sext i1 [[TMP5]] 
to i32 +; CHECK-NEXT: [[COND_I_117:%.*]] = select i1 [[TOBOOL_NOT_I_115]], i32 [[ADD_114]], i32 [[SHR_I_116]] +; CHECK-NEXT: [[CONV_I_118:%.*]] = trunc i32 [[COND_I_117]] to i8 +; CHECK-NEXT: [[ARRAYIDX2_119:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1 +; CHECK-NEXT: store i8 [[CONV_I_118]], i8* [[ARRAYIDX2_119]], align 1 +; CHECK-NEXT: [[ARRAYIDX_220:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_220]], align 1 +; CHECK-NEXT: [[CONV_221:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[MUL_222:%.*]] = mul nsw i32 [[TMP0]], [[CONV_221]] +; CHECK-NEXT: [[ADD_223:%.*]] = add nsw i32 [[MUL_222]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I_224:%.*]] = icmp ult i32 [[ADD_223]], 256 +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_223]], 0 +; CHECK-NEXT: [[SHR_I_225:%.*]] = sext i1 [[TMP7]] to i32 +; CHECK-NEXT: [[COND_I_226:%.*]] = select i1 [[TOBOOL_NOT_I_224]], i32 [[ADD_223]], i32 [[SHR_I_225]] +; CHECK-NEXT: [[CONV_I_227:%.*]] = trunc i32 [[COND_I_226]] to i8 +; CHECK-NEXT: [[ARRAYIDX2_228:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 +; CHECK-NEXT: store i8 [[CONV_I_227]], i8* [[ARRAYIDX2_228]], align 1 +; CHECK-NEXT: [[ARRAYIDX_329:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_329]], align 1 +; CHECK-NEXT: [[CONV_330:%.*]] = zext i8 [[TMP8]] to i32 +; CHECK-NEXT: [[MUL_331:%.*]] = mul nsw i32 [[TMP0]], [[CONV_330]] +; CHECK-NEXT: [[ADD_332:%.*]] = add nsw i32 [[MUL_331]], [[TMP1]] +; CHECK-NEXT: [[TOBOOL_NOT_I_333:%.*]] = icmp ult i32 [[ADD_332]], 256 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_332]], 0 +; CHECK-NEXT: [[SHR_I_334:%.*]] = sext i1 [[TMP9]] to i32 +; CHECK-NEXT: [[COND_I_335:%.*]] = select i1 [[TOBOOL_NOT_I_333]], i32 [[ADD_332]], i32 [[SHR_I_334]] +; CHECK-NEXT: [[CONV_I_336:%.*]] = trunc i32 [[COND_I_335]] to i8 +; CHECK-NEXT: [[ARRAYIDX2_337:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 +; CHECK-NEXT: store i8 [[CONV_I_336]], i8* [[ARRAYIDX2_337]], align 1 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1,17 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=arm64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: memcheck define void @needs_versioning_not_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_not_profitable( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 -; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 -; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 -; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 -; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 -; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 -; 
CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[DST8:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SRC10:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 1 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP9]], i64 1 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i32* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[UGLYGEP13:%.*]] = getelementptr i8, i8* [[SCEVGEP1112]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST8]], [[UGLYGEP13]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC10]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[DST8]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP14:%.*]] = icmp ugt i8* [[UGLYGEP13]], [[SRC10]] +; CHECK-NEXT: [[CHECK15:%.*]] = and i1 [[CHECK]], [[NOWRAP14]] +; CHECK-NEXT: [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_03:%.*]] = ashr i32 [[SRC_02]], 16 +; CHECK-NEXT: store i32 [[R_03]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_14:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_15:%.*]] = load i32, i32* [[SRC_GEP_14]], align 4 +; CHECK-NEXT: [[R_16:%.*]] = ashr i32 [[SRC_15]], 16 +; CHECK-NEXT: [[DST_GEP_17:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_16]], i32* [[DST_GEP_17]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -28,25 +47,41 @@ define void @needs_versioning_profitable(i32* %dst, i32* %src) { ; CHECK-LABEL: @needs_versioning_profitable( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4 -; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16 -; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4 -; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 -; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4 -; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16 -; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 -; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4 -; CHECK-NEXT: [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[SRC_2:%.*]] = load i32, i32* [[SRC_GEP_2]], align 4 -; CHECK-NEXT: [[R_2:%.*]] = ashr i32 [[SRC_2]], 16 -; CHECK-NEXT: [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[R_2]], i32* [[DST_GEP_2]], align 4 -; CHECK-NEXT: [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[SRC_3:%.*]] = load i32, i32* [[SRC_GEP_3]], align 4 -; CHECK-NEXT: [[R_3:%.*]] = ashr i32 [[SRC_3]], 16 -; CHECK-NEXT: [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[R_3]], i32* [[DST_GEP_3]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[DST16:%.*]] = bitcast i32* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SRC18:%.*]] = bitcast i32* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 3 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP17]], i64 1 +; CHECK-NEXT: 
[[SCEVGEP19:%.*]] = getelementptr i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; CHECK-NEXT: [[UGLYGEP21:%.*]] = getelementptr i8, i8* [[SCEVGEP1920]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST16]], [[UGLYGEP21]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SRC18]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[DST16]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP22:%.*]] = icmp ugt i8* [[UGLYGEP21]], [[SRC18]] +; CHECK-NEXT: [[CHECK23:%.*]] = and i1 [[CHECK]], [[NOWRAP22]] +; CHECK-NEXT: [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[R_03:%.*]] = ashr i32 [[SRC_02]], 16 +; CHECK-NEXT: store i32 [[R_03]], i32* [[DST]], align 4 +; CHECK-NEXT: [[SRC_GEP_14:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1 +; CHECK-NEXT: [[SRC_15:%.*]] = load i32, i32* [[SRC_GEP_14]], align 4 +; CHECK-NEXT: [[R_16:%.*]] = ashr i32 [[SRC_15]], 16 +; CHECK-NEXT: [[DST_GEP_17:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1 +; CHECK-NEXT: store i32 [[R_16]], i32* [[DST_GEP_17]], align 4 +; CHECK-NEXT: [[SRC_GEP_28:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[SRC_29:%.*]] = load i32, i32* [[SRC_GEP_28]], align 4 +; CHECK-NEXT: [[R_210:%.*]] = ashr i32 [[SRC_29]], 16 +; CHECK-NEXT: [[DST_GEP_211:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[R_210]], i32* [[DST_GEP_211]], align 4 +; CHECK-NEXT: [[SRC_GEP_312:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[SRC_313:%.*]] = load i32, i32* [[SRC_GEP_312]], align 4 +; CHECK-NEXT: [[R_314:%.*]] = ashr i32 [[SRC_313]], 16 +; CHECK-NEXT: [[DST_GEP_315:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[R_314]], i32* [[DST_GEP_315]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -99,30 +134,65 @@ define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: [[OUT_BLOCK12:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[COUNTER14:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP13]], i64 1 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[UGLYGEP17:%.*]] = getelementptr i8, i8* [[SCEVGEP1516]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT_BLOCK12]], [[UGLYGEP17]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[COUNTER14]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[OUT_BLOCK12]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], 
[[NOWRAP]] +; CHECK-NEXT: [[NOWRAP18:%.*]] = icmp ugt i8* [[UGLYGEP17]], [[COUNTER14]] +; CHECK-NEXT: [[CHECK19:%.*]] = and i1 [[CHECK]], [[NOWRAP18]] +; CHECK-NEXT: br i1 [[CHECK19]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]] +; CHECK: entry.slpversioned: ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] -; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]] -; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 -; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] -; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP9]], [[TMP8]] +; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: 
[[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP13]], [[TMP12]] +; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -294,13 +364,31 @@ define void @slp_not_beneficial(i32* %A, i32* %B) { ; CHECK-LABEL: @slp_not_beneficial( -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 4 -; CHECK-NEXT: store i32 0, i32* [[TMP]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 5 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 8 -; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP3]], align 8 +; CHECK-NEXT: bb.slpmemcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[A]], i64 5 +; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP78]], i64 1 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast i32* [[SCEVGEP9]] to i8* +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[B]], i64 4 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i32* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[UGLYGEP13:%.*]] = getelementptr i8, i8* [[SCEVGEP1112]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP6]], [[UGLYGEP13]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP910]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[SCEVGEP6]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP14:%.*]] = icmp ugt i8* [[UGLYGEP13]], [[SCEVGEP910]] +; CHECK-NEXT: [[CHECK15:%.*]] = and i1 [[CHECK]], [[NOWRAP14]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 4 +; CHECK-NEXT: store i32 0, i32* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 5 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 4 +; CHECK-NEXT: [[TMP55:%.*]] = load i32, i32* [[TMP44]], align 8 +; CHECK-NEXT: store i32 [[TMP55]], i32* [[TMP33]], align 8 ; CHECK-NEXT: ret void ; bb: @@ -315,20 +403,34 @@ define void @widget(double* %ptr, double* %ptr.2) { ; CHECK-LABEL: @widget( -; CHECK-NEXT: bb1: -; CHECK-NEXT: [[TMP3:%.*]] = load double, double* null, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fmul double undef, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fadd double [[TMP6]], [[TMP4]] -; CHECK-NEXT: store double [[TMP7]], double* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[PTR_2:%.*]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fmul double undef, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* 
[[PTR]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP12]], [[TMP10]] -; CHECK-NEXT: store double [[TMP13]], double* [[TMP11]], align 8 +; CHECK-NEXT: bb1.slpmemcheck: +; CHECK-NEXT: [[PTR13:%.*]] = bitcast double* [[PTR:%.*]] to i8* +; CHECK-NEXT: [[PTR_215:%.*]] = bitcast double* [[PTR_2:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[PTR]], i64 1 +; CHECK-NEXT: [[SCEVGEP14:%.*]] = bitcast double* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP14]], i64 1 +; CHECK-NEXT: [[UGLYGEP16:%.*]] = getelementptr i8, i8* [[PTR_215]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR13]], [[UGLYGEP16]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR_215]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[PTR13]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP17:%.*]] = icmp ugt i8* [[UGLYGEP16]], [[PTR_215]] +; CHECK-NEXT: [[CHECK18:%.*]] = and i1 [[CHECK]], [[NOWRAP17]] +; CHECK-NEXT: [[TMP32:%.*]] = load double, double* null, align 8 +; CHECK-NEXT: [[TMP43:%.*]] = fmul double undef, [[TMP32]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[PTR]], i32 0 +; CHECK-NEXT: [[TMP65:%.*]] = load double, double* [[TMP54]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = fadd double [[TMP65]], [[TMP43]] +; CHECK-NEXT: store double [[TMP76]], double* [[TMP54]], align 8 +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds double, double* [[PTR_2]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = load double, double* [[TMP87]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = fmul double undef, [[TMP98]] +; CHECK-NEXT: [[TMP1110:%.*]] = getelementptr inbounds double, double* [[PTR]], i32 1 +; CHECK-NEXT: [[TMP1211:%.*]] = load double, double* [[TMP1110]], align 8 +; CHECK-NEXT: [[TMP1312:%.*]] = fadd double [[TMP1211]], [[TMP109]] +; CHECK-NEXT: store double [[TMP1312]], double* [[TMP1110]], align 8 ; CHECK-NEXT: br label [[BB15:%.*]] ; CHECK: bb15: ; CHECK-NEXT: br label [[BB15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -1,32 +1,70 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s +; RUN: opt -slp-memory-versioning=false -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck --check-prefix=NOVERSION %s + +; NOVERSION-NOT: memcheck define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) { ; CHECK-LABEL: @version_multiple( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: entry.slpmemcheck: +; CHECK-NEXT: 
[[OUT_BLOCK12:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* +; CHECK-NEXT: [[COUNTER14:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP13]], i64 1 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* +; CHECK-NEXT: [[UGLYGEP17:%.*]] = getelementptr i8, i8* [[SCEVGEP1516]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT_BLOCK12]], [[UGLYGEP17]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[COUNTER14]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[OUT_BLOCK12]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP18:%.*]] = icmp ugt i8* [[UGLYGEP17]], [[COUNTER14]] +; CHECK-NEXT: [[CHECK19:%.*]] = and i1 [[CHECK]], [[NOWRAP18]] +; CHECK-NEXT: br i1 [[CHECK19]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]] +; CHECK: entry.slpversioned: ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] -; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]] -; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4 -; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]] -; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: ; CHECK-NEXT: ret void +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 
4 +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP9]], [[TMP8]] +; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP13]], [[TMP12]] +; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -61,7 +99,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float*> poison, float* [[B:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float*> [[TMP0]], float* [[B]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, <2 x float*> [[TMP1]], <2 x i64> -; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN_SLPMEMCHECK:%.*]], label [[ELSE:%.*]] ; CHECK: else: ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP2]], i32 4, <2 x i1> , <2 x float> undef) ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> @@ -69,17 +107,35 @@ ; CHECK-NEXT: [[I71:%.*]] = shufflevector <8 x float> undef, <8 x float> [[TMP4]], <8 x i32> ; CHECK-NEXT: call void @use(<8 x float> [[I71]]) ; CHECK-NEXT: ret void -; CHECK: then: -; CHECK-NEXT: [[A_8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 8 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_8]], align 4 +; CHECK: then.slpmemcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 5 +; CHECK-NEXT: [[SCEVGEP8:%.*]] = bitcast float* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[A]], i64 8 +; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast float* [[SCEVGEP9]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP910]], i64 1 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[B]], i64 14 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast float* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr float, float* [[B]], i64 14 +; CHECK-NEXT: [[SCEVGEP1314:%.*]] = bitcast float* [[SCEVGEP13]] to i8* +; CHECK-NEXT: [[UGLYGEP15:%.*]] = getelementptr i8, i8* [[SCEVGEP1314]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP8]], [[UGLYGEP15]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1112]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; 
CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[SCEVGEP8]] +; CHECK-NEXT: [[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP16:%.*]] = icmp ugt i8* [[UGLYGEP15]], [[SCEVGEP1112]] +; CHECK-NEXT: [[CHECK17:%.*]] = and i1 [[CHECK]], [[NOWRAP16]] +; CHECK-NEXT: [[A_83:%.*]] = getelementptr inbounds float, float* [[A]], i64 8 +; CHECK-NEXT: store float 0.000000e+00, float* [[A_83]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float*> [[TMP2]], i32 1 -; CHECK-NEXT: [[L6:%.*]] = load float, float* [[TMP5]], align 4 -; CHECK-NEXT: [[A_5:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 -; CHECK-NEXT: store float [[L6]], float* [[A_5]], align 4 -; CHECK-NEXT: [[A_6:%.*]] = getelementptr inbounds float, float* [[A]], i64 6 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_6]], align 4 -; CHECK-NEXT: [[A_7:%.*]] = getelementptr inbounds float, float* [[A]], i64 7 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_7]], align 4 +; CHECK-NEXT: [[L64:%.*]] = load float, float* [[TMP5]], align 4 +; CHECK-NEXT: [[A_55:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 +; CHECK-NEXT: store float [[L64]], float* [[A_55]], align 4 +; CHECK-NEXT: [[A_66:%.*]] = getelementptr inbounds float, float* [[A]], i64 6 +; CHECK-NEXT: store float 0.000000e+00, float* [[A_66]], align 4 +; CHECK-NEXT: [[A_77:%.*]] = getelementptr inbounds float, float* [[A]], i64 7 +; CHECK-NEXT: store float 0.000000e+00, float* [[A_77]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -122,6 +178,7 @@ ; CHECK-LABEL: @preserve_loop_info( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = alloca [3 x double], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast [3 x double]* [[TMP]] to i8* ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: ; CHECK-NEXT: br label [[INNER:%.*]] @@ -133,14 +190,30 @@ ; CHECK-NEXT: [[TMP5:%.*]] = load [3 x double]*, [3 x double]** undef, align 8 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP]], i64 0, i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP]], i64 0, i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [3 x double], [3 x double]* [[TMP]], i64 0, i64 1 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = bitcast double* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP5]], i64 1 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 +; CHECK-NEXT: [[SCEVGEP67:%.*]] = bitcast double* [[SCEVGEP6]] to i8* +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 +; CHECK-NEXT: [[SCEVGEP89:%.*]] = bitcast double* [[SCEVGEP8]] to i8* +; CHECK-NEXT: [[UGLYGEP10:%.*]] = getelementptr i8, i8* [[SCEVGEP89]], i64 1 ; CHECK-NEXT: br label [[LOOP_3HEADER:%.*]] ; CHECK: loop.3header: -; CHECK-NEXT: br i1 undef, label [[LOOP_3LATCH:%.*]], label [[BB9:%.*]] -; CHECK: bb9: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 +; CHECK-NEXT: br i1 undef, label [[LOOP_3LATCH:%.*]], label [[BB9_SLPMEMCHECK:%.*]] +; CHECK: bb9.slpmemcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP4]], [[UGLYGEP10]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP67]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true +; CHECK-NEXT: [[NOWRAP:%.*]] = icmp ugt i8* [[UGLYGEP]], [[TMP4]] +; CHECK-NEXT: 
[[CHECK:%.*]] = and i1 [[MEMCHECK_CONFLICT]], [[NOWRAP]] +; CHECK-NEXT: [[NOWRAP11:%.*]] = icmp ugt i8* [[UGLYGEP10]], [[SCEVGEP67]] +; CHECK-NEXT: [[CHECK12:%.*]] = and i1 [[CHECK]], [[NOWRAP11]] +; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1 ; CHECK-NEXT: store double undef, double* [[TMP6]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -; CHECK-NEXT: store double [[TMP12]], double* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP123:%.*]] = load double, double* [[TMP102]], align 8 +; CHECK-NEXT: store double [[TMP123]], double* [[TMP7]], align 8 ; CHECK-NEXT: br label [[LOOP_3LATCH]] ; CHECK: loop.3latch: ; CHECK-NEXT: br i1 undef, label [[BB14:%.*]], label [[LOOP_3HEADER]]
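For orientation (not part of the patch): a minimal LLVM IR sketch of the CFG shape the versioning transform produces, using the block names from the version_multiple tests above. The %conflict flag stands in for the combined alias/no-wrap check the pass materializes via addRuntimeChecks; when it is true, execution falls back to the unmodified scalar clone, otherwise the vectorized, !noalias-annotated block runs.

define void @versioning_shape(i32* %dst, i32* %src, i1 %conflict) {
entry.slpmemcheck:
  ; SCEV-expanded bound and no-wrap checks are emitted here by the pass;
  ; %conflict is a stand-in for the final combined check value.
  br i1 %conflict, label %entry.scalar, label %entry.slpversioned

entry.slpversioned:
  ; vectorized code, annotated with !alias.scope / !noalias metadata
  br label %entry.merge

entry.scalar:
  ; unmodified scalar clone, taken when the accesses may overlap
  br label %entry.merge

entry.merge:
  ret void
}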