diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -244,6 +244,15 @@ SmallVector getInstructionsForAccess(Value *Ptr, bool isWrite) const; + /// Return the program order indices for the access location (Ptr, IsWrite). + /// Returns an empty ArrayRef if there are no accesses for the location. + ArrayRef getOrderForAccess(Value *Ptr, bool IsWrite) const { + auto I = Accesses.find({Ptr, IsWrite}); + if (I != Accesses.end()) + return I->second; + return {}; + } + private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and /// applies dynamic knowledge to simplify SCEV expressions and convert them @@ -392,7 +401,8 @@ AliasSetId(AliasSetId), Expr(Expr) {} }; - RuntimePointerChecking(ScalarEvolution *SE) : SE(SE) {} + RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE) + : DC(DC), SE(SE) {} /// Reset the state of the pointer runtime information. void reset() { @@ -418,11 +428,23 @@ void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies); - /// Returns the checks that generateChecks created. + /// Returns the checks that generateChecks created. They can be used to ensure + /// no read/write accesses overlap across all loop iterations. const SmallVectorImpl &getChecks() const { return Checks; } + // Returns an optional list of (pointer-difference expressions, access size) + // pairs that can be used prove that there are no vectorization-preventing + // dependencies at runtime. There are is a vectorization-preventing dependency + // if any pointer-difference is > getOrderedChecks() const { + if (!CanUseOrderedCheck) + return None; + return {OrderedChecks}; + } + /// Decide if we need to add a check between two groups of pointers, /// according to needsChecking. bool needsChecking(const RuntimeCheckingPtrGroup &M, @@ -477,7 +499,15 @@ bool UseDependencies); /// Generate the checks and return them. - SmallVector generateChecks() const; + SmallVector generateChecks(); + + /// Try to add a new (src, sink) pair to OrderedCheck for checking groups \p + /// CGI and \p CGJ. If ordered checks cannot be used for the groups, set + /// CanUseOrderedCheck to false. + void tryToCreateOrderedCheck(const RuntimeCheckingPtrGroup &CGI, + const RuntimeCheckingPtrGroup &CGJ); + + MemoryDepChecker &DC; /// Holds a pointer to the ScalarEvolution analysis. ScalarEvolution *SE; @@ -485,6 +515,13 @@ /// Set of run-time checks required to establish independence of /// otherwise may-aliasing pointers in the loop. SmallVector Checks; + + /// Flag indicating if pointer-difference checks can be used + bool CanUseOrderedCheck = true; + + /// A list of (Src, Sink) checking pairs that can be used to + /// prove that there are no vectorization-preventing dependencies. + SmallVector OrderedChecks; }; /// Drive the analysis of memory accesses in the loop diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -497,6 +497,11 @@ const SmallVectorImpl &PointerChecks, SCEVExpander &Expander); +Value *addOrderedRuntimeChecks( + Instruction *Loc, Loop *TheLoop, ArrayRef Checks, + SCEVExpander &Expander, + function_ref GetVF, unsigned IC); + /// Struct to hold information about a partially invariant condition. 
struct IVConditionInfo { /// Instructions that need to be duplicated and checked for the unswitching diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -234,8 +234,76 @@ Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); } -SmallVector -RuntimePointerChecking::generateChecks() const { +void RuntimePointerChecking::tryToCreateOrderedCheck( + const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) { + if (!CanUseOrderedCheck) + return; + + // If either group contains multiple different pointers, bail out. + // TODO: Support multiple pointers by using the minimum or maximum pointer, + // depending on src & sink. + if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) { + CanUseOrderedCheck = false; + return; + } + + auto &PtrI = Pointers[CGI.Members[0]]; + auto &PtrJ = Pointers[CGJ.Members[0]]; + + // If either pointer is read and written, multiple checks may be needed. Bail + // out. + if (!DC.getOrderForAccess(PtrI.PointerValue, !PtrI.IsWritePtr).empty() || + !DC.getOrderForAccess(PtrJ.PointerValue, !PtrJ.IsWritePtr).empty()) { + CanUseOrderedCheck = false; + return; + } + + ArrayRef AccI = + DC.getOrderForAccess(PtrI.PointerValue, PtrI.IsWritePtr); + ArrayRef AccJ = + DC.getOrderForAccess(PtrJ.PointerValue, PtrJ.IsWritePtr); + // If either pointer is accessed multiple times, there may not be a clear + // src/sink relation. Bail out for now. + if (AccI.size() != 1 || AccJ.size() != 1) { + CanUseOrderedCheck = false; + return; + } + auto *Src = &PtrI; + auto *Sink = &PtrJ; + bool Swapped = false; + // If PtrJ is accessed before PtrI, swap src/sink. + if (AccJ[0] < AccI[0]) { + std::swap(Src, Sink); + Swapped = true; + } + + auto *SrcAR = dyn_cast(Src->Expr); + auto *SinkAR = dyn_cast(Sink->Expr); + if (!SrcAR || !SinkAR) { + CanUseOrderedCheck = false; + return; + } + + // Only matching constant steps matching the AllocSize are supported at the + // moment. This simplifies the difference computation. Can be extended in the + // future. + auto *Step = dyn_cast(SinkAR->getStepRecurrence(*SE)); + if (!Step || Step != SrcAR->getStepRecurrence(*SE)) { + CanUseOrderedCheck = false; + return; + } + + // When counting down, the dependence distance needs to be swapped. 
+ if (Step->getValue()->isNegative()) + Swapped ^= true; + + if (Swapped) { + OrderedChecks.emplace_back(&CGJ, &CGI); + } else + OrderedChecks.emplace_back(&CGI, &CGJ); +} + +SmallVector RuntimePointerChecking::generateChecks() { SmallVector Checks; for (unsigned I = 0; I < CheckingGroups.size(); ++I) { @@ -243,8 +311,10 @@ const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I]; const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J]; - if (needsChecking(CGI, CGJ)) + if (needsChecking(CGI, CGJ)) { + tryToCreateOrderedCheck(CGI, CGJ); Checks.push_back(std::make_pair(&CGI, &CGJ)); + } } } return Checks; @@ -2290,10 +2360,12 @@ const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI) : PSE(std::make_unique(*SE, *L)), - PtrRtChecking(std::make_unique(SE)), + PtrRtChecking(nullptr), DepChecker(std::make_unique(*PSE, L)), TheLoop(L) { - if (canAnalyzeLoop()) + if (canAnalyzeLoop()) { + PtrRtChecking = std::make_unique(*DepChecker, SE); analyzeLoop(AA, LI, TLI, DT); + } } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1614,6 +1614,38 @@ return MemoryRuntimeCheck; } +Value *llvm::addOrderedRuntimeChecks( + Instruction *Loc, Loop *TheLoop, ArrayRef Checks, + SCEVExpander &Expander, + function_ref GetVF, unsigned IC) { + + LLVMContext &Ctx = Loc->getContext(); + IRBuilder ChkBuilder(Ctx, + Loc->getModule()->getDataLayout()); + ChkBuilder.SetInsertPoint(Loc); + // Our instructions might fold to a constant. + Value *MemoryRuntimeCheck = nullptr; + + for (auto &C : Checks) { + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, C.first->AddressSpace); + Value *SrcLowerBound = + Expander.expandCodeFor(C.first->Low, PtrArithTy, Loc); + Value *SinkUpperBound = + Expander.expandCodeFor(C.second->High, PtrArithTy, Loc); + + Value *IsConflict = + ChkBuilder.CreateICmpULT(SrcLowerBound, SinkUpperBound, "bound"); + + if (MemoryRuntimeCheck) { + IsConflict = + ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); + } + MemoryRuntimeCheck = IsConflict; + } + + return MemoryRuntimeCheck; +} + Optional llvm::hasPartialIVCondition(Loop &L, unsigned MSSAThreshold, MemorySSA &MSSA, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2009,7 +2009,8 @@ /// there is no vector code generation, the check blocks are removed /// completely. 
void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVUnionPredicate &UnionPred) { + const SCEVUnionPredicate &UnionPred, ElementCount VF, + unsigned IC) { BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -2032,9 +2033,19 @@ MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + auto OrderedChecks = RtPtrChecking.getOrderedChecks(); + if (OrderedChecks) { + MemRuntimeCheckCond = addOrderedRuntimeChecks( + MemCheckBlock->getTerminator(), L, *OrderedChecks, MemCheckExp, + [VF](IRBuilderBase &B, unsigned Bits) { + return getRuntimeVF(B, B.getIntNTy(Bits), VF); + }, + IC); + } else { + MemRuntimeCheckCond = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " "claimed checks are required"); @@ -3207,13 +3218,17 @@ AddedSafetyChecks = true; - // We currently don't use LoopVersioning for the actual loop cloning but we - // still use it to add the noalias metadata. - LVer = std::make_unique( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); - LVer->prepareNoAliasMetadata(); + // Only use noalias metadata when using memory checks guaranteeing no overlap + // across all iterations. + if (!Legal->getLAI()->getRuntimePointerChecking()->getOrderedChecks()) { + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + LVer = std::make_unique( + *Legal->getLAI(), + Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, + DT, PSE.getSE()); + LVer->prepareNoAliasMetadata(); + } return MemCheckBlock; } @@ -10541,7 +10556,7 @@ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, F->getParent()->getDataLayout()); if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), VF.Width, IC); using namespace ore; if (!VectorizeLoop) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -18,12 +18,9 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[N]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* %b, i64 [[N]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP]], %a +; CHECK-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 @@ -34,15 +31,15 @@ ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[INDEX]], -1 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[N]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* %b, i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP7]], -8 ; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[DOTNEG]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8, !alias.scope !0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* %a, i64 [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[DOTNEG7:%.*]] = mul i32 [[TMP14]], -8 @@ -50,7 +47,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to * -; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP19]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] @@ -69,10 +66,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_08]] = add nsw i64 [[I_08_IN]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[I_08]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* %b, i64 [[I_08]] ; CHECK-NEXT: [[TMP22:%.*]] = load double, double* [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP22]], 1.000000e+00 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* %a, i64 [[I_08]] ; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] @@ -108,12 +105,9 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 [[N]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i64* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i64* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i64, i64* %b, i64 [[N]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp 
ugt i64* [[SCEVGEP]], %a +; CHECK-NEXT: br i1 [[BOUND1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 @@ -124,15 +118,15 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[INDEX]], -1 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[N]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP7]], -8 ; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[DOTNEG]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8, !alias.scope !9 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[DOTNEG7:%.*]] = mul i32 [[TMP14]], -8 @@ -140,7 +134,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to * -; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8, !alias.scope !12, !noalias !9 +; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP19]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] @@ -159,10 +153,10 @@ ; CHECK: for.body: ; CHECK-NEXT: [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_09]] = add nsw i64 [[I_09_IN]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[I_09]] ; CHECK-NEXT: [[TMP22:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[TMP22]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[I_09]] ; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll @@ -11,33 +11,29 @@ ; CHECK-NEXT: br i1 [[CMP_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] 
= getelementptr i8, i8* [[PDST:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[SCEVGEP1]], [[PDST]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP]], [[PSRC]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* %pDst, i32 [[BLOCKSIZE]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP]], %pSrc +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.+]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -16 -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PSRC]], i32 [[N_VEC]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PSRC:%.+]], i32 [[N_VEC]] ; CHECK-NEXT: [[IND_END3:%.*]] = and i32 [[BLOCKSIZE]], 15 -; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, i8* [[PDST]], i32 [[N_VEC]] +; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, i8* [[PDST:%.+]], i32 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[PDST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i8> zeroinitializer, [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> , <16 x i8> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[WIDE_LOAD]], <16 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[NEXT_GEP6]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] @@ -45,9 +41,9 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[PSRC]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[BLOCKSIZE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i8* [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[PDST]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], 
[[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i8* [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PSRC_ADDR_022:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -106,14 +102,10 @@ ; CHECK-NEXT: br i1 [[CMP_NOT20]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i16* [[SCEVGEP4]], [[PDST]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i16* [[SCEVGEP]], [[PSRC]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* %pDst, i32 [[BLOCKSIZE]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i16* [[SCEVGEP]], %pSrc +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.+]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PSRC]], i32 [[N_VEC]] @@ -125,14 +117,14 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i16, i16* [[PDST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !alias.scope !8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i16> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i16> zeroinitializer, [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> , <8 x i16> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[WIDE_LOAD]], <8 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[NEXT_GEP10]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2, !alias.scope !11, !noalias !8 +; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] @@ -140,9 +132,9 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[PSRC]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[BLOCKSIZE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i16* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], 
[[WHILE_BODY_PREHEADER]] ], [ [[PDST]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i16* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PSRC_ADDR_023:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -201,14 +193,10 @@ ; CHECK-NEXT: br i1 [[CMP_NOT14]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PDST:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PSRC:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[PDST]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[PSRC]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* %pDst, i32 [[BLOCKSIZE]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], %pSrc +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.+]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -4 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[PSRC]], i32 [[N_VEC]] @@ -220,14 +208,14 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[PSRC]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[PDST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !15 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> , <4 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[NEXT_GEP10]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] @@ -235,9 +223,9 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[PSRC]], 
[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[BLOCKSIZE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[PDST]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PSRC_ADDR_017:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -20,24 +20,15 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo1( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* -; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX1-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* +; AVX1-NEXT: [[TRIGGER1:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* +; AVX1-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* ; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX1: vector.memcheck: -; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 ; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000 -; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TRIGGER1]], [[SCEVGEP2]] +; AVX1-NEXT: [[BOUND04:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -47,17 +38,17 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 ; 
AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !3 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -89,24 +80,15 @@ ; ; AVX2-LABEL: @foo1( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX2-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 ; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND04:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -122,16 +104,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr 
inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -142,16 +124,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -162,16 +144,16 @@ ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* 
[[TMP40]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -203,24 +185,15 @@ ; ; AVX512-LABEL: @foo1( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: 
vector.main.loop.iter.check: ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -238,16 +211,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -258,16 +231,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> 
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -278,16 +251,16 @@ ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -379,24 +352,15 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo1_addrspace1( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* ; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* ; AVX1-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* ; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX1: vector.memcheck: -; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 ; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* -; AVX1-NEXT: [[SCEVGEP4:%.*]] = 
getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 -; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)* -; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 -; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)* -; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]] -; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]] -; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] +; AVX1-NEXT: [[BOUND04:%.*]] = icmp ult i8 addrspace(1)* [[B3]], [[SCEVGEP2]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -406,17 +370,17 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -448,24 +412,15 @@ ; ; AVX2-LABEL: @foo1_addrspace1( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 
addrspace(1)* ; AVX2-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 ; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND04:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -481,16 +436,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> 
[[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -501,16 +456,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -521,16 +476,16 @@ ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void 
@llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -562,24 +517,15 @@ ; ; AVX512-LABEL: @foo1_addrspace1( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* ; AVX512-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -597,16 +543,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 
addrspace(1)* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -617,16 +563,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> 
addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -637,16 +583,16 @@ ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -747,24 +693,15 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo2( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast 
i32* [[TRIGGER:%.*]] to i8* ; AVX1-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* ; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX1: vector.memcheck: -; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 ; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000 -; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* -; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX1-NEXT: [[BOUND04:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -774,18 +711,18 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison), !alias.scope !24 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison) ; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] @@ -818,24 +755,15 @@ ; ; AVX2-LABEL: @foo2( ; AVX2-NEXT: entry: -; 
AVX2-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX2-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 ; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND04:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -851,16 +779,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -871,16 +799,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> 
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison) ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison) ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) ; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> ; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> @@ -895,16 +823,16 @@ ; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]) ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]) ; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* -; AVX2-NEXT: call 
void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] @@ -937,24 +865,15 @@ ; ; AVX512-LABEL: @foo2( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND04]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -972,16 +891,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds 
i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -992,16 +911,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison) ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison) ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> ; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float> @@ -1016,16 +935,16 @@ ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* 
[[TMP46]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] @@ -1162,16 +1081,16 @@ ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4 ; AVX-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12 ; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], ; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], ; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], @@ -1182,16 +1101,16 @@ ; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] ; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), 
!alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4 ; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12 ; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> ; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double> ; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double> @@ -1206,16 +1125,16 @@ ; AVX-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] ; AVX-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4 ; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[TMP50:%.*]] = getelementptr double, double* 
[[TMP40]], i32 12 ; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] @@ -1281,16 +1200,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -1301,16 +1220,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr 
double, double* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 24 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double> ; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> @@ -1325,16 +1244,16 @@ ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 24 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] @@ -1457,15 +1376,15 @@ ; 
AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !45 +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !21 ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !48 +; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> ; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]] ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !50, !noalias !52 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 @@ -1669,22 +1588,22 @@ ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 ; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3 ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE15:%.*]] = 
shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer ; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer @@ -1698,25 +1617,25 @@ ; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3 ; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4 ; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3 ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3 ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* 
[[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], ; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], @@ -1730,22 +1649,22 @@ ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 ; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3 ; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4 ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3 ; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 ; AVX2-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3 ; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12 ; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] @@ -1812,22 +1731,22 @@ ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7 ; AVX512-NEXT: 
[[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7 ; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer @@ -1841,25 +1760,25 @@ ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -7 ; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -7 ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> 
[[REVERSE20]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -16 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -7 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -24 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -7 ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], ; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], @@ -1873,22 +1792,22 @@ ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -7 ; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -7 ; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -16 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -7 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* 
[[TMP55]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -24 ; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -7 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP63:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -22,11 +22,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[UMAX]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 16 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP4]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[BOUND1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX2]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -34,28 +31,28 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP5]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP6]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP5]], i64 4 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP7]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* 
[[TMP5]], i64 8 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP9]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP5]], i64 12 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP11]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP13]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP13]], i64 4 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP15]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD3]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD3]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP17]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD4]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD4]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP13]], i64 12 ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP19]] to <4 x i8 addrspace(1)*> addrspace(1)* -; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD5]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD5]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -27,17 +27,13 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], 
label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP5]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i32* [[SCEVGEP3]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i32* [[SCEVGEP]], [[SCEVGEP2]] +; CHECK-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD]], i64 3 @@ -48,12 +44,12 @@ ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -91,17 +87,13 @@ ; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL: vector.memcheck: +; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1 ; UNROLL-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; UNROLL-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; UNROLL-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] -; UNROLL-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; UNROLL-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; UNROLL-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP5]], [[B]] -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ult i32* [[SCEVGEP3]], [[SCEVGEP]] -; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[SCEVGEP2:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ult 
i32* [[SCEVGEP]], [[SCEVGEP2]] +; UNROLL-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934584 ; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD]], i64 3 @@ -112,20 +104,20 @@ ; UNROLL-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]] ; UNROLL-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0 +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 ; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 4 ; UNROLL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0 +; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 ; UNROLL-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> ; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP15:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]] ; UNROLL-NEXT: [[TMP16:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[TMP13]] ; UNROLL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4, !alias.scope !3, !noalias !0 +; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 4 ; UNROLL-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4, !alias.scope !3, !noalias !0 +; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -154,7 +146,6 @@ ; ; UNROLL-NO-IC-LABEL: @recurrence_1( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; UNROLL-NO-IC-NEXT: br label [[FOR_PREHEADER:%.*]] ; UNROLL-NO-IC: for.preheader: ; UNROLL-NO-IC-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 @@ -165,20 +156,15 @@ ; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-IC: vector.memcheck: +; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1 +; UNROLL-NO-IC-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; 
UNROLL-NO-IC-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-IC-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; UNROLL-NO-IC-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP23]] +; UNROLL-NO-IC-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] @@ -195,10 +181,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !alias.scope !0 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !0 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]] @@ -207,10 +193,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[TMP18]] ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP24]], align 4, !alias.scope !3, !noalias !0 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP24]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP26]], align 4, !alias.scope !3, !noalias !0 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP26]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -240,7 +226,6 @@ ; ; UNROLL-NO-VF-LABEL: @recurrence_1( ; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; UNROLL-NO-VF-NEXT: br label [[FOR_PREHEADER:%.*]] ; UNROLL-NO-VF: for.preheader: ; UNROLL-NO-VF-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 @@ -251,20 +236,15 @@ ; UNROLL-NO-VF-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 ; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: +; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1 +; UNROLL-NO-VF-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; UNROLL-NO-VF-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; UNROLL-NO-VF-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-VF-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; UNROLL-NO-VF-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP23]] +; UNROLL-NO-VF-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] @@ -278,14 +258,14 @@ ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1 ; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]] ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]] -; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4, !alias.scope !0 -; UNROLL-NO-VF-NEXT: [[TMP12]] = load i32, i32* [[TMP10]], align 4, !alias.scope !0 +; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP12]] = load i32, i32* [[TMP10]], align 4 ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]] ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]] ; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = add i32 [[TMP11]], [[VECTOR_RECUR]] ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = add i32 [[TMP12]], [[TMP11]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP15]], i32* [[TMP13]], align 4, !alias.scope !3, !noalias !0 -; UNROLL-NO-VF-NEXT: store i32 [[TMP16]], i32* [[TMP14]], align 4, !alias.scope !3, !noalias !0 +; UNROLL-NO-VF-NEXT: store i32 [[TMP15]], i32* [[TMP13]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP16]], i32* [[TMP14]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -313,7 +293,6 @@ ; ; SINK-AFTER-LABEL: @recurrence_1( ; SINK-AFTER-NEXT: entry: -; SINK-AFTER-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; SINK-AFTER-NEXT: br label [[FOR_PREHEADER:%.*]] ; 
SINK-AFTER: for.preheader: ; SINK-AFTER-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 @@ -324,20 +303,15 @@ ; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 ; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; SINK-AFTER: vector.memcheck: +; SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1 +; SINK-AFTER-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* ; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; SINK-AFTER-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; SINK-AFTER-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; SINK-AFTER-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; SINK-AFTER-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; SINK-AFTER-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; SINK-AFTER-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; SINK-AFTER-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; SINK-AFTER-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; SINK-AFTER-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP23]] +; SINK-AFTER-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] @@ -351,13 +325,13 @@ ; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]] ; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 ; SINK-AFTER-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0 +; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 ; SINK-AFTER-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]] ; SINK-AFTER-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]] ; SINK-AFTER-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0 ; SINK-AFTER-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; SINK-AFTER-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !alias.scope !3, !noalias !0 +; SINK-AFTER-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -387,7 +361,6 @@ ; ; NO-SINK-AFTER-LABEL: @recurrence_1( ; NO-SINK-AFTER-NEXT: entry: -; NO-SINK-AFTER-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; NO-SINK-AFTER-NEXT: br label [[FOR_PREHEADER:%.*]] ; NO-SINK-AFTER: for.preheader: ; 
NO-SINK-AFTER-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 @@ -398,20 +371,15 @@ ; NO-SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 ; NO-SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; NO-SINK-AFTER: vector.memcheck: +; NO-SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1 +; NO-SINK-AFTER-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* ; NO-SINK-AFTER-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 ; NO-SINK-AFTER-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; NO-SINK-AFTER-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; NO-SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; NO-SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; NO-SINK-AFTER-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; NO-SINK-AFTER-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; NO-SINK-AFTER-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; NO-SINK-AFTER-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; NO-SINK-AFTER-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; NO-SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; NO-SINK-AFTER-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; NO-SINK-AFTER-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; NO-SINK-AFTER-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; NO-SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; NO-SINK-AFTER-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; NO-SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP23]] +; NO-SINK-AFTER-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; NO-SINK-AFTER: vector.ph: ; NO-SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 ; NO-SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] @@ -425,13 +393,13 @@ ; NO-SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]] ; NO-SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 ; NO-SINK-AFTER-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0 +; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 ; NO-SINK-AFTER-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; NO-SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]] ; NO-SINK-AFTER-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]] ; NO-SINK-AFTER-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0 ; NO-SINK-AFTER-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !alias.scope !3, !noalias !0 +; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4 ; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; NO-SINK-AFTER-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-SINK-AFTER-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -1012,7 +980,7 @@ ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[OFFSET_IDX]] 
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double> @@ -1020,7 +988,7 @@ ; CHECK-NEXT: [[TMP15:%.*]] = fsub fast <4 x double> [[TMP12]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* -; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8, !alias.scope !14, !noalias !11 +; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1098,10 +1066,10 @@ ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[OFFSET_IDX]] ; UNROLL-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>* -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11 +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i64 4 ; UNROLL-NEXT: [[TMP12:%.*]] = bitcast i16* [[TMP11]] to <4 x i16>* -; UNROLL-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2, !alias.scope !11 +; UNROLL-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2 ; UNROLL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD8]], <4 x i32> ; UNROLL-NEXT: [[TMP15:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double> @@ -1114,10 +1082,10 @@ ; UNROLL-NEXT: [[TMP22:%.*]] = fsub fast <4 x double> [[TMP16]], [[TMP20]] ; UNROLL-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[OFFSET_IDX]] ; UNROLL-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>* -; UNROLL-NEXT: store <4 x double> [[TMP21]], <4 x double>* [[TMP24]], align 8, !alias.scope !14, !noalias !11 +; UNROLL-NEXT: store <4 x double> [[TMP21]], <4 x double>* [[TMP24]], align 8 ; UNROLL-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP23]], i64 4 ; UNROLL-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; UNROLL-NEXT: store <4 x double> [[TMP22]], <4 x double>* [[TMP26]], align 8, !alias.scope !14, !noalias !11 +; UNROLL-NEXT: store <4 x double> [[TMP22]], <4 x double>* [[TMP26]], align 8 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1202,10 +1170,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = bitcast i16* [[TMP11]] to <4 x i16>* -; 
UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2, !alias.scope !11 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP13]] to <4 x i16>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP14]], align 2, !alias.scope !11 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP14]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD8]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double> @@ -1220,10 +1188,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP25]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = bitcast double* [[TMP27]] to <4 x double>* -; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP23]], <4 x double>* [[TMP28]], align 8, !alias.scope !14, !noalias !11 +; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP23]], <4 x double>* [[TMP28]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP25]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP24]], <4 x double>* [[TMP30]], align 8, !alias.scope !14, !noalias !11 +; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP24]], <4 x double>* [[TMP30]], align 8 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1302,8 +1270,8 @@ ; UNROLL-NO-VF-NEXT: [[INDUCTION8:%.*]] = add i64 [[OFFSET_IDX]], 1 ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION]] ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION8]] -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !10 -; UNROLL-NO-VF-NEXT: [[TMP10]] = load i16, i16* [[TMP8]], align 2, !alias.scope !10 +; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP10]] = load i16, i16* [[TMP8]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sitofp i16 [[TMP9]] to double ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sitofp i16 [[TMP10]] to double ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double @@ -1314,8 +1282,8 @@ ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = fsub fast double [[TMP12]], [[TMP16]] ; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION]] ; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION8]] -; UNROLL-NO-VF-NEXT: store double [[TMP17]], double* [[TMP19]], align 8, !alias.scope !13, !noalias !10 -; UNROLL-NO-VF-NEXT: store double [[TMP18]], double* [[TMP20]], align 8, !alias.scope !13, !noalias !10 +; UNROLL-NO-VF-NEXT: store double [[TMP17]], double* [[TMP19]], align 8 +; UNROLL-NO-VF-NEXT: store double [[TMP18]], double* [[TMP20]], align 8 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] @@ -1395,7 +1363,7 @@ ; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP7]] ; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0 ; SINK-AFTER-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>* -; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11 +; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2 ; SINK-AFTER-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double> ; SINK-AFTER-NEXT: [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double> @@ -1404,7 +1372,7 @@ ; SINK-AFTER-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]] ; SINK-AFTER-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP16]], i32 0 ; SINK-AFTER-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* -; SINK-AFTER-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8, !alias.scope !14, !noalias !11 +; SINK-AFTER-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1486,7 +1454,7 @@ ; NO-SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP7]] ; NO-SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0 ; NO-SINK-AFTER-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>* -; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11 +; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2 ; NO-SINK-AFTER-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; NO-SINK-AFTER-NEXT: [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double> ; NO-SINK-AFTER-NEXT: [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double> @@ -1495,7 +1463,7 @@ ; NO-SINK-AFTER-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]] ; NO-SINK-AFTER-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP16]], i32 0 ; NO-SINK-AFTER-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* -; NO-SINK-AFTER-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8, !alias.scope !14, !noalias !11 +; NO-SINK-AFTER-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8 ; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; NO-SINK-AFTER-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-SINK-AFTER-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -3246,14 +3214,14 @@ ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26 +; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> 
[[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !alias.scope !29, !noalias !26 +; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] @@ -3307,10 +3275,10 @@ ; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]] ; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26 +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i64 4 ; UNROLL-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>* -; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2, !alias.scope !26 +; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2 ; UNROLL-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> ; UNROLL-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32> @@ -3321,10 +3289,10 @@ ; UNROLL-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP11]] ; UNROLL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4, !alias.scope !29, !noalias !26 +; UNROLL-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4 ; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i64 4 ; UNROLL-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP19]], align 4, !alias.scope !29, !noalias !26 +; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP19]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] @@ -3386,10 +3354,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <4 x i16>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2, !alias.scope !26 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>* -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !26 +; UNROLL-NO-IC-NEXT: 
[[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32> @@ -3402,10 +3370,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4, !alias.scope !29, !noalias !26 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4, !alias.scope !29, !noalias !26 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] @@ -3465,8 +3433,8 @@ ; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]] ; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2, !alias.scope !25 -; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2, !alias.scope !25 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP5]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP5]] to i32 @@ -3475,8 +3443,8 @@ ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]] ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]] ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4, !alias.scope !28, !noalias !25 -; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4, !alias.scope !28, !noalias !25 +; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] @@ -3534,7 +3502,7 @@ ; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]] ; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0 ; SINK-AFTER-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* -; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26 +; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 ; SINK-AFTER-NEXT: [[TMP6:%.*]] = 
shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32> ; SINK-AFTER-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> @@ -3542,7 +3510,7 @@ ; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] ; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; SINK-AFTER-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4, !alias.scope !29, !noalias !26 +; SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] @@ -3602,7 +3570,7 @@ ; NO-SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]] ; NO-SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0 ; NO-SINK-AFTER-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* -; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26 +; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 ; NO-SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; NO-SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32> ; NO-SINK-AFTER-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32> @@ -3610,7 +3578,7 @@ ; NO-SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] ; NO-SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 ; NO-SINK-AFTER-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4, !alias.scope !29, !noalias !26 +; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4 ; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; NO-SINK-AFTER-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-SINK-AFTER-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] @@ -3685,26 +3653,17 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C:%.*]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[SCEVGEP3]] to i32* +; CHECK-NEXT: [[BOUND05:%.*]] = icmp ugt i32* [[TMP0]], [[C]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND05]] ; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[C]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], 
[[BOUND1]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; CHECK-NEXT: [[BOUND010:%.*]] = icmp ugt i32* [[TMP0]], [[C]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[SCEVGEP]] to i16* -; CHECK-NEXT: [[BOUND111:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP1]] -; CHECK-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; CHECK-NEXT: [[BOUND013:%.*]] = icmp ugt i32* [[TMP2]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[SCEVGEP4]] to i16* -; CHECK-NEXT: [[BOUND114:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP3]] -; CHECK-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; CHECK-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[BOUND08:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP1]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[BOUND08]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3 @@ -3721,11 +3680,11 @@ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP5]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP6]], i64 1 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[TMP12]], align 4, !alias.scope !33, !noalias !36 -; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39 -; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP9]], align 2, !alias.scope !39 -; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP10]], align 2, !alias.scope !39 -; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP11]], align 2, !alias.scope !39 +; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP9]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP11]], align 2 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i64 1 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i64 2 @@ -3736,7 +3695,7 @@ ; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP26]], align 4, !alias.scope !40, !noalias !39 +; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP26]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] @@ -3772,26 +3731,17 @@ ; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL: vector.memcheck: -; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, 
i32* [[C:%.*]], i64 [[N]] -; UNROLL-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] +; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] +; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C:%.*]] +; UNROLL-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 +; UNROLL-NEXT: [[TMP0:%.*]] = bitcast i16* [[SCEVGEP3]] to i32* +; UNROLL-NEXT: [[BOUND05:%.*]] = icmp ugt i32* [[TMP0]], [[C]] +; UNROLL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND05]] ; UNROLL-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; UNROLL-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[C]] -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] -; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NEXT: [[TMP0:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; UNROLL-NEXT: [[BOUND010:%.*]] = icmp ugt i32* [[TMP0]], [[C]] ; UNROLL-NEXT: [[TMP1:%.*]] = bitcast i32* [[SCEVGEP]] to i16* -; UNROLL-NEXT: [[BOUND111:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP1]] -; UNROLL-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; UNROLL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; UNROLL-NEXT: [[TMP2:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; UNROLL-NEXT: [[BOUND013:%.*]] = icmp ugt i32* [[TMP2]], [[B]] -; UNROLL-NEXT: [[TMP3:%.*]] = bitcast i32* [[SCEVGEP4]] to i16* -; UNROLL-NEXT: [[BOUND114:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP3]] -; UNROLL-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; UNROLL-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; UNROLL-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[BOUND08:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP1]] +; UNROLL-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[BOUND08]] +; UNROLL-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3 @@ -3816,22 +3766,22 @@ ; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP9]], i64 1 ; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP10]], i64 1 ; UNROLL-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> , <4 x i32>* [[TMP20]], align 4, !alias.scope !33, !noalias !36 +; UNROLL-NEXT: store <4 x i32> , <4 x i32>* [[TMP20]], align 4 ; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 4 ; UNROLL-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> , <4 x i32>* [[TMP22]], align 4, !alias.scope !33, !noalias !36 -; UNROLL-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP12]], align 2, !alias.scope !39 -; UNROLL-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP13]], align 2, !alias.scope !39 -; UNROLL-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP14]], align 2, !alias.scope !39 -; UNROLL-NEXT: [[TMP26:%.*]] = load i16, i16* [[TMP15]], align 2, !alias.scope !39 +; UNROLL-NEXT: store <4 x i32> , <4 x i32>* [[TMP22]], align 4 +; UNROLL-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP12]], align 2 +; UNROLL-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP13]], align 2 +; UNROLL-NEXT: [[TMP25:%.*]] = load 
i16, i16* [[TMP14]], align 2 +; UNROLL-NEXT: [[TMP26:%.*]] = load i16, i16* [[TMP15]], align 2 ; UNROLL-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[TMP23]], i64 0 ; UNROLL-NEXT: [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[TMP24]], i64 1 ; UNROLL-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP25]], i64 2 ; UNROLL-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP26]], i64 3 -; UNROLL-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP16]], align 2, !alias.scope !39 -; UNROLL-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP17]], align 2, !alias.scope !39 -; UNROLL-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP18]], align 2, !alias.scope !39 -; UNROLL-NEXT: [[TMP34:%.*]] = load i16, i16* [[TMP19]], align 2, !alias.scope !39 +; UNROLL-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP16]], align 2 +; UNROLL-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP17]], align 2 +; UNROLL-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP18]], align 2 +; UNROLL-NEXT: [[TMP34:%.*]] = load i16, i16* [[TMP19]], align 2 ; UNROLL-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> poison, i16 [[TMP31]], i64 0 ; UNROLL-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP32]], i64 1 ; UNROLL-NEXT: [[TMP37:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[TMP33]], i64 2 @@ -3846,10 +3796,10 @@ ; UNROLL-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP44]], [[TMP42]] ; UNROLL-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP48]], align 4, !alias.scope !40, !noalias !39 +; UNROLL-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP48]], align 4 ; UNROLL-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, i32* [[TMP47]], i64 4 ; UNROLL-NEXT: [[TMP50:%.*]] = bitcast i32* [[TMP49]] to <4 x i32>* -; UNROLL-NEXT: store <4 x i32> [[TMP46]], <4 x i32>* [[TMP50]], align 4, !alias.scope !40, !noalias !39 +; UNROLL-NEXT: store <4 x i32> [[TMP46]], <4 x i32>* [[TMP50]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] @@ -3881,32 +3831,23 @@ ; UNROLL-NO-IC-LABEL: @PR34711( ; UNROLL-NO-IC-NEXT: entry: ; UNROLL-NO-IC-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8* -; UNROLL-NO-IC-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* ; UNROLL-NO-IC-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0 ; UNROLL-NO-IC-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2 ; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-IC: vector.memcheck: -; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]] +; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] ; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* +; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP2]] +; UNROLL-NO-IC-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 +; UNROLL-NO-IC-NEXT: [[SCEVGEP34:%.*]] = bitcast i16* [[SCEVGEP3]] to i8* +; 
UNROLL-NO-IC-NEXT: [[BOUND05:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP34]]
+; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND05]]
; UNROLL-NO-IC-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1
; UNROLL-NO-IC-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8*
-; UNROLL-NO-IC-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
-; UNROLL-NO-IC-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8*
-; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]]
-; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; UNROLL-NO-IC-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]]
-; UNROLL-NO-IC-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
-; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; UNROLL-NO-IC-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]]
-; UNROLL-NO-IC-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]]
-; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]]
-; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]]
-; UNROLL-NO-IC-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC-NEXT: [[BOUND08:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
+; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[BOUND08]]
+; UNROLL-NO-IC-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-IC: vector.ph:
; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
@@ -3935,22 +3876,22 @@
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP7]], i64 1
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP19]], align 4, !alias.scope !33, !noalias !36
+; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP19]], align 4
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP21]], align 4, !alias.scope !33, !noalias !36
-; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP10]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP11]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP12]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP13]], align 2, !alias.scope !39
+; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP10]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP13]], align 2
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP22]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP23]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] =
insertelement <4 x i16> [[TMP27]], i16 [[TMP24]], i32 2 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP25]], i32 3 -; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i16, i16* [[TMP14]], align 2, !alias.scope !39 -; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP15]], align 2, !alias.scope !39 -; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP16]], align 2, !alias.scope !39 -; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP17]], align 2, !alias.scope !39 +; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i16, i16* [[TMP14]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP15]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP16]], align 2 +; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP17]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i16> poison, i16 [[TMP30]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP34]], i16 [[TMP31]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP32]], i32 2 @@ -3967,10 +3908,10 @@ ; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP46]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP49]], align 4, !alias.scope !40, !noalias !39 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP49]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP46]], i32 4 ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <4 x i32>* -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP51]], align 4, !alias.scope !40, !noalias !39 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP51]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] @@ -4004,32 +3945,23 @@ ; UNROLL-NO-VF-LABEL: @PR34711( ; UNROLL-NO-VF-NEXT: entry: ; UNROLL-NO-VF-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* ; UNROLL-NO-VF-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0 ; UNROLL-NO-VF-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2 ; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 ; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]] +; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] ; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* +; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP2]] +; UNROLL-NO-VF-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 +; UNROLL-NO-VF-NEXT: [[SCEVGEP34:%.*]] = bitcast i16* [[SCEVGEP3]] to i8* +; UNROLL-NO-VF-NEXT: [[BOUND05:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP34]] +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND05]] ; 
UNROLL-NO-VF-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 ; UNROLL-NO-VF-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; UNROLL-NO-VF-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-VF-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]] -; UNROLL-NO-VF-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; UNROLL-NO-VF-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]] -; UNROLL-NO-VF-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; UNROLL-NO-VF-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: [[BOUND08:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[BOUND08]] +; UNROLL-NO-VF-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -4043,10 +3975,10 @@ ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION17]] ; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION]], i64 1 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION17]], i64 1 -; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP0]], align 4, !alias.scope !32, !noalias !35 -; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP1]], align 4, !alias.scope !32, !noalias !35 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2, !alias.scope !38 -; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2, !alias.scope !38 +; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP0]], align 4 +; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP1]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 +; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32 @@ -4055,8 +3987,8 @@ ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP9]], [[TMP7]] ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]] ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION17]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4, !alias.scope !39, !noalias !38 -; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4, !alias.scope !39, !noalias !38 +; UNROLL-NO-VF-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; 
UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] @@ -4088,32 +4020,23 @@ ; SINK-AFTER-LABEL: @PR34711( ; SINK-AFTER-NEXT: entry: ; SINK-AFTER-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8* -; SINK-AFTER-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* ; SINK-AFTER-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0 ; SINK-AFTER-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2 ; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; SINK-AFTER: vector.memcheck: -; SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]] +; SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] ; SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; SINK-AFTER-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* +; SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP2]] +; SINK-AFTER-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 +; SINK-AFTER-NEXT: [[SCEVGEP34:%.*]] = bitcast i16* [[SCEVGEP3]] to i8* +; SINK-AFTER-NEXT: [[BOUND05:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP34]] +; SINK-AFTER-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND05]] ; SINK-AFTER-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 ; SINK-AFTER-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; SINK-AFTER-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8* -; SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]] -; SINK-AFTER-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; SINK-AFTER-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]] -; SINK-AFTER-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; SINK-AFTER-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; SINK-AFTER-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]] -; SINK-AFTER-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; SINK-AFTER-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; SINK-AFTER-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: [[BOUND08:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] +; SINK-AFTER-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[BOUND08]] +; SINK-AFTER-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -4133,11 +4056,11 @@ ; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP3]], i64 1 ; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; SINK-AFTER-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; SINK-AFTER-NEXT: 
store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4, !alias.scope !33, !noalias !36
-; SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39
+; SINK-AFTER-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4
+; SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2
+; SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2
+; SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2
+; SINK-AFTER-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2
; SINK-AFTER-NEXT: [[TMP15:%.*]] = insertelement <4 x i16> poison, i16 [[TMP11]], i32 0
; SINK-AFTER-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP12]], i32 1
; SINK-AFTER-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 2
@@ -4149,7 +4072,7 @@
; SINK-AFTER-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
; SINK-AFTER-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0
; SINK-AFTER-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4, !alias.scope !40, !noalias !39
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4183,32 +4106,23 @@
; NO-SINK-AFTER-LABEL: @PR34711(
; NO-SINK-AFTER-NEXT: entry:
; NO-SINK-AFTER-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; NO-SINK-AFTER-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
; NO-SINK-AFTER-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0
; NO-SINK-AFTER-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2
; NO-SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
; NO-SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; NO-SINK-AFTER: vector.memcheck:
-; NO-SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]]
+; NO-SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]]
; NO-SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; NO-SINK-AFTER-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]]
-; NO-SINK-AFTER-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
+; NO-SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP2]]
+; NO-SINK-AFTER-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
+; NO-SINK-AFTER-NEXT: [[SCEVGEP34:%.*]] = bitcast i16* [[SCEVGEP3]] to i8*
+; NO-SINK-AFTER-NEXT: [[BOUND05:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP34]]
+; NO-SINK-AFTER-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND0]], [[BOUND05]]
; NO-SINK-AFTER-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1
; NO-SINK-AFTER-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8*
-; NO-SINK-AFTER-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
-; NO-SINK-AFTER-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8*
-; NO-SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]]
-; NO-SINK-AFTER-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; NO-SINK-AFTER-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; NO-SINK-AFTER-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]]
-; NO-SINK-AFTER-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
-; NO-SINK-AFTER-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; NO-SINK-AFTER-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; NO-SINK-AFTER-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]]
-; NO-SINK-AFTER-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]]
-; NO-SINK-AFTER-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]]
-; NO-SINK-AFTER-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]]
-; NO-SINK-AFTER-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; NO-SINK-AFTER-NEXT: [[BOUND08:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
+; NO-SINK-AFTER-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[BOUND08]]
+; NO-SINK-AFTER-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; NO-SINK-AFTER: vector.ph:
; NO-SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; NO-SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
@@ -4228,11 +4142,11 @@
; NO-SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP3]], i64 1
; NO-SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; NO-SINK-AFTER-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; NO-SINK-AFTER-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4, !alias.scope !33, !noalias !36
-; NO-SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2, !alias.scope !39
-; NO-SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2, !alias.scope !39
-; NO-SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !39
-; NO-SINK-AFTER-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39
+; NO-SINK-AFTER-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4
+; NO-SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2
+; NO-SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2
+; NO-SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2
+; NO-SINK-AFTER-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2
; NO-SINK-AFTER-NEXT: [[TMP15:%.*]] = insertelement <4 x i16> poison, i16 [[TMP11]], i32 0
; NO-SINK-AFTER-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP12]], i32 1
; NO-SINK-AFTER-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 2
@@ -4244,7 +4158,7 @@
; NO-SINK-AFTER-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
; NO-SINK-AFTER-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0
; NO-SINK-AFTER-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4, !alias.scope !40, !noalias !39
+; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4
; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; NO-SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
@@ -4334,7 +4248,7 @@
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
@@ -4342,7 +4256,7 @@
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !alias.scope !46, !noalias !43
+; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -4397,10 +4311,10 @@
; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i64 4
; UNROLL-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2, !alias.scope !43
+; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2
; UNROLL-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
@@ -4413,10 +4327,10 @@
; UNROLL-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP15]]
; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4
; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i64 4
; UNROLL-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP21]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP21]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -4479,10 +4393,10 @@
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2, !alias.scope !43
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !43
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
@@ -4497,10 +4411,10 @@
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* [[TMP24]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP26]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP26]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -4561,8 +4475,8 @@
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]]
; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
-; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2, !alias.scope !42
-; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2, !alias.scope !42
+; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2
+; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2
; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP5]] to i32
; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP7]], 2
@@ -4573,8 +4487,8 @@
; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP12]]
; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
-; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4, !alias.scope !45, !noalias !42
-; UNROLL-NO-VF-NEXT: store i32 [[TMP14]], i32* [[TMP16]], align 4, !alias.scope !45, !noalias !42
+; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4
+; UNROLL-NO-VF-NEXT: store i32 [[TMP14]], i32* [[TMP16]], align 4
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
@@ -4633,7 +4547,7 @@
; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
; SINK-AFTER-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
@@ -4642,7 +4556,7 @@
; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; SINK-AFTER-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
@@ -4703,7 +4617,7 @@
; NO-SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
; NO-SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
; NO-SINK-AFTER-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; NO-SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; NO-SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; NO-SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; NO-SINK-AFTER-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
@@ -4712,7 +4626,7 @@
; NO-SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; NO-SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; NO-SINK-AFTER-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4, !alias.scope !46, !noalias !43
+; NO-SINK-AFTER-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
; NO-SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; NO-SINK-AFTER-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-SINK-AFTER-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll b/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
--- a/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll
@@ -4,13 +4,13 @@
; multiple exiting branches.
; Multiple branches exiting the loop to a unique exit block. The loop should
-; be vectorized with versioning & noalias metadata should be added.
+; be vectorized with versioning.
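; As a rough sketch only (hypothetical block names, not taken from this
; patch), the CFG shape exercised here is a loop whose header and latch both
; branch to one shared exit block:
;   loop.header:  br i1 %ec.0, label %exit, label %loop.latch
;   loop.latch:   br i1 %ec.1, label %exit, label %loop.header
;   exit:         ret void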
define void @multiple_exits_unique_exit_block(i32* %A, i32* %B, i64 %N) { ; CHECK-LABEL: @multiple_exits_unique_exit_block ; CHECK: vector.memcheck: ; CHECK-LABEL: vector.body: -; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4, !alias.scope -; CHECK: store <2 x i32> %wide.load, <2 x i32>* {{.*}}, align 4, !alias.scope +; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4 +; CHECK: store <2 x i32> %wide.load, <2 x i32>* {{.*}}, align 4 ; CHECK: br ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -338,8 +338,8 @@ ; CHECK-LABEL: vector.body: ; CHECK: %wide.load = load <2 x i32>, <2 x i32>* -; CHECK: %wide.load16 = load <2 x i32>, <2 x i32>* -; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load16 +; CHECK: %wide.load8 = load <2 x i32>, <2 x i32>* +; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load8 ; CHECK: store <2 x i32> ; CHECK-LABEL: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll --- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -6,15 +6,9 @@ ; CHECK-LABEL: @add_ints( ; CHECK-LABEL: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 200 -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 200 -; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 200 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND09:%.*]] = icmp ugt i32* [[SCEVGEP7]], [[A]] -; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]] -; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], %B +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], %C +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[BOUND1]], [[BOUND110]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %scalar.ph, label %vector.ph ; CHECK: vector.ph: ; CHECK-NEXT: br label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll --- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s ; Tests where the indices of some accesses are clamped to a small range. 
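; As an illustration (a hypothetical function, not one of the tests below,
; modelled on the scalar loops checked in this file): the urem clamps the
; index into %A to [0, 3], so at most four i32 elements of %A are accessed.
define void @clamped_index_sketch(i32* %A, i32* %B, i32 %N) {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %clamped = urem i32 %iv, 4                  ; index clamped to a small range
  %gep.a = getelementptr inbounds i32, i32* %A, i32 %clamped
  %lv = load i32, i32* %gep.a, align 4
  %add = add i32 %lv, 10
  %gep.b = getelementptr inbounds i32, i32* %B, i32 %iv
  store i32 %add, i32* %gep.b, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %cond = icmp eq i32 %iv.next, %N
  br i1 %cond, label %exit, label %loop

exit:
  ret void
}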
@@ -13,8 +12,7 @@
define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
; CHECK-LABEL: @load_clamped_index(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[A1:%.*]] = bitcast i32* %A to i8*
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
@@ -25,14 +23,10 @@
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* %B, i64 [[TMP12]]
; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]]
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP2]]
+; CHECK-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
@@ -41,15 +35,15 @@
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP14:%.*]] = urem i32 [[TMP13]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* %A, i32 [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* %B, i32 [[TMP13]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -62,10 +56,10 @@
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* %A, i32 [[CLAMPED_INDEX]]
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_A]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV]], 10
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[IV]]
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* %B, i32 [[IV]]
; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_B]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
@@ -96,8 +90,7 @@
define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
; CHECK-LABEL: @store_clamped_index(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* %B to i8*
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
@@ -108,14 +101,10 @@
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* %A, i64 [[TMP12]]
; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[BOUND0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
@@ -124,15 +113,15 @@
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP14:%.*]] = urem i32 [[TMP13]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* %B, i32 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* %A, i32 [[TMP14]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !11
+; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -145,10 +134,10 @@
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[CLAMPED_INDEX:%.*]] = urem i32 [[IV]], 4
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[IV]]
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, i32* %B, i32 [[IV]]
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_B]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV]], 10
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[CLAMPED_INDEX]]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* %A, i32 [[CLAMPED_INDEX]]
; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -26,23 +26,20 @@
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64, [[DBG9]]
; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1, [[DBG9]]
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP5]], [[DBG9]]
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP5]], [[DBG9]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[A]], [[DBG9]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]], [[DBG9]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]], [[DBG9]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], %b, [[DBG9]]
+; CHECK-NEXT: br i1 [[BOUND1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588, [[DBG9]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], [[DBG9]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [[DBG9]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* %b, i64 [[INDEX]], [[DBG9]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*, [[DBG9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, [[DBG9]], !alias.scope !10
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, [[DBG9]]
; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, [[DBG9]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]], [[DBG9]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*, [[DBG9]]
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4, [[DBG9]], !alias.scope !13, !noalias !10
+; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4, [[DBG9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, [[DBG9]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], [[DBG9]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[DBG9]], [[LOOP15:!llvm.loop !.*]]
@@ -54,7 +51,7 @@
; CHECK-NEXT: br label [[FOR_BODY:%.*]], [[DBG9]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [[DBG9]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]], [[DBG9]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* %b, i64 [[INDVARS_IV]], [[DBG9]]
; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4, [[DBG9]]
; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP12]], 3.000000e+00, [[DBG9]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]], [[DBG9]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -17,16 +17,16 @@
; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8
; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8
; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]]
; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
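; To make that arithmetic concrete (an illustrative sketch, assuming
; vscale = 2 at runtime): with VF = vscale x 4 and an interleave factor of 2,
; each vector iteration advances the index by 2 * (vscale * 4) = vscale * 8
; elements, i.e. vscale shifted left by 3, matching the CHECKUF2 lines below:
;   %vscale = call i64 @llvm.vscale.i64()
;   %step = shl i64 %vscale, 3          ; vscale * 8 = 16 lanes when vscale = 2
;   %index.next = add nuw i64 %index, %step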
@@ -48,29 +48,29 @@
; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8
; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8
; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8
; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
--- a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
+++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -17,7 +17,6 @@
; CHECK-NOTBAA-LABEL: @test1
; CHECK-NOTBAA: entry:
-; CHECK-NOTBAA: icmp ugt i32*
; CHECK-NOTBAA: icmp ugt float*
; CHECK-NOTBAA-NOT: icmp
; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
@@ -51,7 +50,6 @@
; CHECK-LABEL: @test2
; CHECK: entry:
; CHECK: icmp ugt float*
-; CHECK: icmp ugt float*
; CHECK-NOT: icmp uge i32*
; CHECK: br i1 {{.+}}, label %for.body, label %vector.body
@@ -63,8 +61,6 @@
; CHECK-NOTBAA-LABEL: @test2
; CHECK-NOTBAA: entry:
; CHECK-NOTBAA: icmp ugt float*
-; CHECK-NOTBAA: icmp ugt float*
-; CHECK-NOTBAA-DAG: icmp ugt float*
; CHECK-NOTBAA-DAG: icmp ugt i32*
; CHECK-NOTBAA-NOT: icmp
; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
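; Summarizing the check updates above with a sketch (illustrative value names
; mirroring the runtime-check.ll hunk): where the old output emitted a
; two-sided overlap test per pointer pair,
;   %bound0 = icmp ugt float* %scevgep.b, %a
;   %bound1 = icmp ugt float* %scevgep.a, %b
;   %found.conflict = and i1 %bound0, %bound1
; the patched output keeps a single one-sided bound per pair and no longer
; attaches !alias.scope/!noalias metadata to the vector memory accesses:
;   %bound1 = icmp ugt float* %scevgep.a, %b
;   br i1 %bound1, label %scalar.ph, label %vector.ph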