diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -244,6 +244,15 @@
   SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr,
                                                          bool isWrite) const;

+  /// Return the program order indices for the access location (Ptr, IsWrite).
+  /// Returns an empty ArrayRef if there are no accesses for the location.
+  ArrayRef<unsigned> getOrderForAccess(Value *Ptr, bool IsWrite) const {
+    auto I = Accesses.find({Ptr, IsWrite});
+    if (I != Accesses.end())
+      return I->second;
+    return {};
+  }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and
   /// applies dynamic knowledge to simplify SCEV expressions and convert them
@@ -359,6 +368,16 @@
     const RuntimeCheckingPtrGroup *> RuntimePointerCheck;

+struct PointerDiffInfo {
+  const SCEV *SrcStart;
+  const SCEV *SinkStart;
+  unsigned AccessSize;
+
+  PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
+                  unsigned AccessSize)
+      : SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize) {}
+};
+
 /// Holds information about the memory runtime legality checks to verify
 /// that a group of pointers do not overlap.
 class RuntimePointerChecking {
@@ -392,7 +411,8 @@
         AliasSetId(AliasSetId), Expr(Expr) {}
   };

-  RuntimePointerChecking(ScalarEvolution *SE) : SE(SE) {}
+  RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE)
+      : DC(DC), SE(SE) {}

   /// Reset the state of the pointer runtime information.
   void reset() {
@@ -418,11 +438,23 @@
   void generateChecks(MemoryDepChecker::DepCandidates &DepCands,
                       bool UseDependencies);

-  /// Returns the checks that generateChecks created.
+  /// Returns the checks that generateChecks created. They can be used to
+  /// ensure that no read/write accesses overlap across all loop iterations.
   const SmallVectorImpl<RuntimePointerCheck> &getChecks() const { return Checks; }

+  // Returns an optional list of (pointer-difference expression, access size)
+  // pairs that can be used to prove that there are no vectorization-preventing
+  // dependencies at runtime. There may be a vectorization-preventing
+  // dependency if any pointer difference is smaller than VF * interleave
+  // count * access size. Returns None if pointer-difference checks cannot be
+  // used.
+  Optional<ArrayRef<PointerDiffInfo>> getDiffChecks() const {
+    if (!CanUseDiffCheck)
+      return None;
+    return {DiffChecks};
+  }
+
   /// Decide if we need to add a check between two groups of pointers,
   /// according to needsChecking.
   bool needsChecking(const RuntimeCheckingPtrGroup &M,
@@ -477,7 +509,15 @@
                       bool UseDependencies);

   /// Generate the checks and return them.
-  SmallVector<RuntimePointerCheck, 4> generateChecks() const;
+  SmallVector<RuntimePointerCheck, 4> generateChecks();
+
+  /// Try to add a new (pointer-difference, access size) pair to DiffChecks for
+  /// the checking groups \p CGI and \p CGJ. If pointer-difference checks
+  /// cannot be used for the groups, set CanUseDiffCheck to false.
+  void tryToCreateDiffCheck(const RuntimeCheckingPtrGroup &CGI,
+                            const RuntimeCheckingPtrGroup &CGJ);
+
+  MemoryDepChecker &DC;

   /// Holds a pointer to the ScalarEvolution analysis.
   ScalarEvolution *SE;
@@ -485,6 +525,13 @@
   /// Set of run-time checks required to establish independence of
   /// otherwise may-aliasing pointers in the loop.
   SmallVector<RuntimePointerCheck, 4> Checks;
+
+  /// Flag indicating if pointer-difference checks can be used.
+  bool CanUseDiffCheck = true;
+
+  /// A list of (pointer-difference, access size) pairs that can be used to
+  /// prove that there are no vectorization-preventing dependencies.
+  SmallVector<PointerDiffInfo> DiffChecks;
 };

 /// Drive the analysis of memory accesses in the loop
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -14,6 +14,7 @@
 #define LLVM_TRANSFORMS_UTILS_LOOPUTILS_H

 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"

 namespace llvm {
@@ -500,6 +501,12 @@
                  const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
                  SCEVExpander &Expander);

+Value *
+addDiffRuntimeChecks(Instruction *Loc, Loop *TheLoop,
+                     ArrayRef<PointerDiffInfo> Checks, SCEVExpander &Expander,
+                     function_ref<Value *(IRBuilderBase &, unsigned)> GetVF,
+                     unsigned IC);
+
 /// Struct to hold information about a partially invariant condition.
 struct IVConditionInfo {
   /// Instructions that need to be duplicated and checked for the unswitching
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -233,8 +233,98 @@
   Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
 }

-SmallVector<RuntimePointerCheck, 4>
-RuntimePointerChecking::generateChecks() const {
+void RuntimePointerChecking::tryToCreateDiffCheck(
+    const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
+  if (!CanUseDiffCheck)
+    return;
+
+  // If either group contains multiple different pointers, bail out.
+  // TODO: Support multiple pointers by using the minimum or maximum pointer,
+  // depending on src & sink.
+  if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
+    CanUseDiffCheck = false;
+    return;
+  }
+
+  PointerInfo *Src = &Pointers[CGI.Members[0]];
+  PointerInfo *Sink = &Pointers[CGJ.Members[0]];
+
+  // If either pointer is read and written, multiple checks may be needed. Bail
+  // out.
+  if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
+      !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
+    CanUseDiffCheck = false;
+    return;
+  }
+
+  ArrayRef<unsigned> AccSrc =
+      DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
+  ArrayRef<unsigned> AccSink =
+      DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
+  // If either pointer is accessed multiple times, there may not be a clear
+  // src/sink relation. Bail out for now.
+  if (AccSrc.size() != 1 || AccSink.size() != 1) {
+    CanUseDiffCheck = false;
+    return;
+  }
+  // If the sink is accessed before src, swap src/sink.
+  if (AccSink[0] < AccSrc[0])
+    std::swap(Src, Sink);
+
+  auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
+  auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
+  if (!SrcAR || !SinkAR) {
+    CanUseDiffCheck = false;
+    return;
+  }
+
+  const DataLayout &DL =
+      SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
+  auto GetAccessedTy = [](Instruction *I) -> Type * {
+    if (auto *SI = dyn_cast<StoreInst>(I))
+      return SI->getValueOperand()->getType();
+
+    if (auto *LI = dyn_cast<LoadInst>(I))
+      return LI->getType();
+
+    return nullptr;
+  };
+  SmallVector<Instruction *, 4> SrcInsts =
+      DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
+  SmallVector<Instruction *, 4> SinkInsts =
+      DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
+  unsigned AllocSize =
+      std::max(DL.getTypeAllocSize(GetAccessedTy(SrcInsts[0])),
+               DL.getTypeAllocSize(GetAccessedTy(SinkInsts[0])));
+  IntegerType *IntTy =
+      IntegerType::get(Src->PointerValue->getContext(),
+                       DL.getPointerSizeInBits(CGI.AddressSpace));
+
+  // Only constant steps matching the AllocSize are supported at the
+  // moment.
+  // This simplifies the difference computation. It can be extended in the
+  // future.
+  auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
+  if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
+      Step->getAPInt().abs() != AllocSize) {
+    CanUseDiffCheck = false;
+    return;
+  }
+
+  // When counting down, the dependence distance needs to be swapped.
+  if (Step->getValue()->isNegative())
+    std::swap(SinkAR, SrcAR);
+
+  const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
+  const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
+  if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
+      isa<SCEVCouldNotCompute>(SrcStartInt)) {
+    CanUseDiffCheck = false;
+    return;
+  }
+  DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize);
+}
+
+SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
   SmallVector<RuntimePointerCheck, 4> Checks;

   for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
@@ -242,8 +332,10 @@
       const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
       const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];

-      if (needsChecking(CGI, CGJ))
+      if (needsChecking(CGI, CGJ)) {
+        tryToCreateDiffCheck(CGI, CGJ);
         Checks.push_back(std::make_pair(&CGI, &CGJ));
+      }
     }
   }
   return Checks;
@@ -2307,10 +2399,12 @@
                                const TargetLibraryInfo *TLI, AAResults *AA,
                                DominatorTree *DT, LoopInfo *LI)
     : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
-      PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
+      PtrRtChecking(nullptr),
       DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
-  if (canAnalyzeLoop())
+  if (canAnalyzeLoop()) {
+    PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
     analyzeLoop(AA, LI, TLI, DT);
+  }
 }

 void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1608,6 +1608,40 @@
   return MemoryRuntimeCheck;
 }

+Value *llvm::addDiffRuntimeChecks(
+    Instruction *Loc, Loop *TheLoop, ArrayRef<PointerDiffInfo> Checks,
+    SCEVExpander &Expander,
+    function_ref<Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC) {
+
+  LLVMContext &Ctx = Loc->getContext();
+  IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx,
+                                           Loc->getModule()->getDataLayout());
+  ChkBuilder.SetInsertPoint(Loc);
+  // Our instructions might fold to a constant.
+  Value *MemoryRuntimeCheck = nullptr;
+
+  for (auto &C : Checks) {
+    Type *Ty = C.SinkStart->getType();
+    // Compute VF * IC * AccessSize.
+    auto *VFTimesUFTimesSize =
+        ChkBuilder.CreateMul(GetVF(ChkBuilder, Ty->getScalarSizeInBits()),
+                             ConstantInt::get(Ty, IC * C.AccessSize));
+    Value *Sink = Expander.expandCodeFor(C.SinkStart, Ty, Loc);
+    Value *Src = Expander.expandCodeFor(C.SrcStart, Ty, Loc);
+    Value *Diff = ChkBuilder.CreateSub(Sink, Src);
+    Value *IsConflict =
+        ChkBuilder.CreateICmpULT(Diff, VFTimesUFTimesSize, "diff.check");
+
+    if (MemoryRuntimeCheck) {
+      IsConflict =
+          ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+    }
+    MemoryRuntimeCheck = IsConflict;
+  }
+
+  return MemoryRuntimeCheck;
+}
+
 Optional<IVConditionInfo> llvm::hasPartialIVCondition(Loop &L,
                                                       unsigned MSSAThreshold,
                                                       MemorySSA &MSSA,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1971,7 +1971,7 @@
   /// there is no vector code generation, the check blocks are removed
   /// completely.
void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVPredicate &Pred) { + const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -1980,12 +1980,12 @@ // ensure the blocks are properly added to LoopInfo & DominatorTree. Those // may be used by SCEVExpander. The blocks will be un-linked from their // predecessors and removed from LI & DT at the end of the function. - if (!Pred.isAlwaysTrue()) { + if (!UnionPred.isAlwaysTrue()) { SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, nullptr, "vector.scevcheck"); SCEVCheckCond = SCEVExp.expandCodeForPredicate( - &Pred, SCEVCheckBlock->getTerminator()); + &UnionPred, SCEVCheckBlock->getTerminator()); } const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); @@ -1994,9 +1994,19 @@ MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, "vector.memcheck"); - MemRuntimeCheckCond = - addRuntimeChecks(MemCheckBlock->getTerminator(), L, - RtPtrChecking.getChecks(), MemCheckExp); + auto DiffChecks = RtPtrChecking.getDiffChecks(); + if (DiffChecks) { + MemRuntimeCheckCond = addDiffRuntimeChecks( + MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, + [VF](IRBuilderBase &B, unsigned Bits) { + return getRuntimeVF(B, B.getIntNTy(Bits), VF); + }, + IC); + } else { + MemRuntimeCheckCond = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + } assert(MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking " "claimed checks are required"); @@ -3061,13 +3071,17 @@ AddedSafetyChecks = true; - // We currently don't use LoopVersioning for the actual loop cloning but we - // still use it to add the noalias metadata. - LVer = std::make_unique( - *Legal->getLAI(), - Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, - DT, PSE.getSE()); - LVer->prepareNoAliasMetadata(); + // Only use noalias metadata when using memory checks guaranteeing no overlap + // across all iterations. + if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) { + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. 
+ LVer = std::make_unique( + *Legal->getLAI(), + Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, + DT, PSE.getSE()); + LVer->prepareNoAliasMetadata(); + } return MemCheckBlock; } @@ -10518,7 +10532,7 @@ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, F->getParent()->getDataLayout()); if (!VF.Width.isScalar() || IC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate()); + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); using namespace ore; if (!VectorizeLoop) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -10,6 +10,8 @@ define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{ ; CHECK-LABEL: @vector_reverse_f64( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A2:%.*]] = ptrtoint double* [[A:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint double* [[B:%.*]] to i64 ; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: @@ -18,16 +20,18 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[N]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[B1]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], [[A2]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP3]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -41,7 +45,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() @@ -50,7 +54,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 
[[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to * -; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP19]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] @@ -100,6 +104,8 @@ define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 { ; CHECK-LABEL: @vector_reverse_i64( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A2:%.*]] = ptrtoint i64* [[A:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i64* [[B:%.*]] to i64 ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: @@ -108,16 +114,18 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 [[N]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i64* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i64* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[B1]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], [[A2]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP3]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -131,7 +139,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8, !alias.scope !9 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() @@ -140,7 +148,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to * -; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8, !alias.scope !12, !noalias !9 +; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: 
[[TMP20:%.*]] = shl i64 [[TMP19]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll @@ -7,18 +7,16 @@ define void @arm_abs_q7(i8* nocapture readonly %pSrc, i8* nocapture %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @arm_abs_q7( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[PSRC2:%.*]] = ptrtoint i8* [[PSRC:%.*]] to i32 +; CHECK-NEXT: [[PDST1:%.*]] = ptrtoint i8* [[PDST:%.*]] to i32 ; CHECK-NEXT: [[CMP_NOT19:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[SCEVGEP1]], [[PDST]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP]], [[PSRC]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[PDST1]], [[PSRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -16 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PSRC]], i32 [[N_VEC]] @@ -30,14 +28,14 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[PDST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i8> zeroinitializer, [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> , <16 x i8> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[WIDE_LOAD]], <16 x i8> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[NEXT_GEP6]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] @@ -45,9 +43,9 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[PSRC]], [[VECTOR_MEMCHECK]] ] 
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[BLOCKSIZE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i8* [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[PDST]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i8* [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PSRC_ADDR_022:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -102,18 +100,16 @@ define void @arm_abs_q15(i16* nocapture readonly %pSrc, i16* nocapture %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @arm_abs_q15( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[PSRC2:%.*]] = ptrtoint i16* [[PSRC:%.*]] to i32 +; CHECK-NEXT: [[PDST1:%.*]] = ptrtoint i16* [[PDST:%.*]] to i32 ; CHECK-NEXT: [[CMP_NOT20:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT20]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i16* [[SCEVGEP4]], [[PDST]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i16* [[SCEVGEP]], [[PSRC]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[PDST1]], [[PSRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PSRC]], i32 [[N_VEC]] @@ -125,14 +121,14 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i16, i16* [[PDST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !alias.scope !8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i16> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i16> zeroinitializer, [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> , <8 x i16> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[WIDE_LOAD]], <8 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[NEXT_GEP10]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2, !alias.scope !11, !noalias !8 +; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* 
[[TMP6]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] @@ -140,9 +136,9 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[PSRC]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[BLOCKSIZE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i16* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[PDST]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i16* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PSRC_ADDR_023:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -197,18 +193,16 @@ define void @arm_abs_q31(i32* nocapture readonly %pSrc, i32* nocapture %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @arm_abs_q31( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[PSRC2:%.*]] = ptrtoint i32* [[PSRC:%.*]] to i32 +; CHECK-NEXT: [[PDST1:%.*]] = ptrtoint i32* [[PDST:%.*]] to i32 ; CHECK-NEXT: [[CMP_NOT14:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT14]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PDST:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[PSRC:%.*]], i32 [[BLOCKSIZE]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[PDST]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[PSRC]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[PDST1]], [[PSRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -4 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[PSRC]], i32 [[N_VEC]] @@ -220,14 +214,14 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[PSRC]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[PDST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !15 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt 
<4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> , <4 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[NEXT_GEP10]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] @@ -235,9 +229,9 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[PSRC]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[BLOCKSIZE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[PDST]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PSRC_ADDR_017:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -20,24 +20,16 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo1( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* -; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX1-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* +; AVX1-NEXT: [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 +; AVX1-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 ; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX1: vector.memcheck: -; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 -; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000 -; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; 
AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -47,17 +39,17 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !3 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -89,24 +81,16 @@ ; ; AVX2-LABEL: @foo1( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* -; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX2-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* +; AVX2-NEXT: [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 +; AVX2-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 -; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = 
bitcast i32* [[SCEVGEP7]] to i8* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 +; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -122,16 +106,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -142,16 +126,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* 
[[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -162,16 +146,16 @@ ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -203,24 +187,16 @@ ; ; AVX512-LABEL: @foo1( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* -; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX512-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* +; AVX512-NEXT: [[B3:%.*]] = ptrtoint 
i32* [[B:%.*]] to i64 +; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 +; AVX512-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 -; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256 +; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256 +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -238,16 +214,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -258,16 +234,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] ; AVX512-NEXT: 
[[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -278,16 +254,16 @@ ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x 
i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -379,24 +355,16 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo1_addrspace1( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* -; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* -; AVX1-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* +; AVX1-NEXT: [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64 +; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64 +; AVX1-NEXT: [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64 ; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX1: vector.memcheck: -; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 -; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* -; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 -; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)* -; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 -; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)* -; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]] -; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]] -; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -406,17 +374,17 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX1-NEXT: 
[[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -448,24 +416,16 @@ ; ; AVX2-LABEL: @foo1_addrspace1( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* -; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* -; AVX2-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* +; AVX2-NEXT: [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64 +; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64 +; AVX2-NEXT: [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64 ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 -; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 +; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], 
[[B3]] +; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -481,16 +441,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -501,16 +461,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call 
<8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -521,16 +481,16 @@ ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -562,24 +522,16 @@ ; ; AVX512-LABEL: @foo1_addrspace1( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 
addrspace(1)* -; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* -; AVX512-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* +; AVX512-NEXT: [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64 +; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64 +; AVX512-NEXT: [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64 ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 -; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP4]] to i8 addrspace(1)* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP7]] to i8 addrspace(1)* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256 +; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256 +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -597,16 +549,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 ; 
AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !13 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -617,16 +569,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !16 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -637,16 +589,16 @@ ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x 
i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !18, !noalias !20 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -747,24 +699,16 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo2( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* -; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX1-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* +; AVX1-NEXT: [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64 +; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 +; AVX1-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64 ; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX1: vector.memcheck: -; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 -; AVX1-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; AVX1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX1-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000 -; AVX1-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* -; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX1-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX1-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX1-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX1-NEXT: 
[[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -774,18 +718,18 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison), !alias.scope !24 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison) ; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] @@ -818,24 +762,16 @@ ; ; AVX2-LABEL: @foo2( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* -; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX2-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* +; AVX2-NEXT: [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64 +; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 +; AVX2-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64 ; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 -; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* 
[[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 +; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -851,16 +787,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -871,16 +807,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison) ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison) ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr 
float, float* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) ; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> ; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> @@ -895,16 +831,16 @@ ; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]) ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]) ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]) ; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] @@ -937,24 +873,16 @@ ; ; AVX512-LABEL: @foo2( ; AVX512-NEXT: iter.check: -; AVX512-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* -; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX512-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* +; AVX512-NEXT: [[B3:%.*]] = ptrtoint 
float* [[B:%.*]] to i64 +; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64 +; AVX512-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64 ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: -; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 -; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 -; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX512-NEXT: [[SCEVGEP7:%.*]] = getelementptr float, float* [[B]], i64 10000 -; AVX512-NEXT: [[SCEVGEP78:%.*]] = bitcast float* [[SCEVGEP7]] to i8* -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] -; AVX512-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] -; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]] +; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256 +; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]] +; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256 +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -972,16 +900,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !24 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -992,16 +920,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 
[[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison) ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison) ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison), !alias.scope !27 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> ; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float> @@ -1016,16 +944,16 @@ ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]) ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* 
[[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !29, !noalias !31 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] @@ -1162,16 +1090,16 @@ ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4 ; AVX-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12 ; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !7 ; AVX-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], ; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], ; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], @@ -1182,16 +1110,16 @@ ; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] ; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4 ; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> 
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12 ; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !10 ; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> ; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double> ; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double> @@ -1206,16 +1134,16 @@ ; AVX-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] ; AVX-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4 ; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12 ; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] @@ -1281,16 +1209,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !35 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -1301,16 +1229,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 24 ; AVX512-NEXT: [[TMP31:%.*]] = 
bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !38 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !14 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double> ; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> @@ -1325,16 +1253,16 @@ ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 24 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !40, !noalias !42 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] @@ -1457,15 +1385,15 @@ ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !45 +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], 
i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !21 ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !48 +; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> ; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]] ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !50, !noalias !52 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 @@ -1669,22 +1597,22 @@ ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 ; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3 ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !17 ; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x 
i32> ; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer ; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer @@ -1698,25 +1626,25 @@ ; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3 ; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4 ; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3 ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3 ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !20 ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], ; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], @@ -1730,22 +1658,22 @@ ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, double* 
[[TMP44]], i32 0 ; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3 ; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4 ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3 ; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 ; AVX2-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3 ; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12 ; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] @@ -1812,22 +1740,22 @@ ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7 ; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; 
AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7 ; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !55 +; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !31 ; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer @@ -1841,25 +1769,25 @@ ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -7 ; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -7 ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -16 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -7 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> 
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -24 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -7 ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !58 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], ; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], @@ -1873,22 +1801,22 @@ ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -7 ; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -7 ; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -16 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -7 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -24 ; 
AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -7 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !60, !noalias !62 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP63:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -17,26 +17,21 @@ define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { ; CHECK-LABEL: @recurrence_1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; CHECK-NEXT: br label [[FOR_PREHEADER:%.*]] ; CHECK: for.preheader: -; CHECK-NEXT: [[PRE_LOAD:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[PRE_LOAD:%.*]] = load i32, i32* [[A]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP5]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i32* [[SCEVGEP3]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[A2]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[TMP3]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD]], i64 3 @@ -81,26 +76,21 @@ ; ; UNROLL-LABEL: @recurrence_1( ; UNROLL-NEXT: entry: +; UNROLL-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; UNROLL-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; UNROLL-NEXT: br label [[FOR_PREHEADER:%.*]] ; UNROLL: for.preheader: -; UNROLL-NEXT: [[PRE_LOAD:%.*]] = load i32, i32* [[A:%.*]], align 4 +; UNROLL-NEXT: [[PRE_LOAD:%.*]] = load i32, i32* [[A]], align 4 ; UNROLL-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 ; UNROLL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = add nuw nsw i64 
[[TMP1]], 1 ; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL: vector.memcheck: -; UNROLL-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 -; UNROLL-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; UNROLL-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] -; UNROLL-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; UNROLL-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; UNROLL-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP5]], [[B]] -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ult i32* [[SCEVGEP3]], [[SCEVGEP]] -; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[TMP3:%.*]] = add i64 [[A2]], 4 +; UNROLL-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[TMP3]] +; UNROLL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 32 +; UNROLL-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934584 ; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD]], i64 3 @@ -153,7 +143,8 @@ ; ; UNROLL-NO-IC-LABEL: @recurrence_1( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; UNROLL-NO-IC-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; UNROLL-NO-IC-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; UNROLL-NO-IC-NEXT: br label [[FOR_PREHEADER:%.*]] ; UNROLL-NO-IC: for.preheader: ; UNROLL-NO-IC-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 @@ -164,20 +155,10 @@ ; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-IC: vector.memcheck: -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; UNROLL-NO-IC-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-IC-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i64 [[A2]], 4 +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[TMP3]] +; UNROLL-NO-IC-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 32 +; UNROLL-NO-IC-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], 
[[N_MOD_VF]] @@ -239,7 +220,8 @@ ; ; UNROLL-NO-VF-LABEL: @recurrence_1( ; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; UNROLL-NO-VF-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; UNROLL-NO-VF-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; UNROLL-NO-VF-NEXT: br label [[FOR_PREHEADER:%.*]] ; UNROLL-NO-VF: for.preheader: ; UNROLL-NO-VF-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 @@ -250,20 +232,10 @@ ; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 ; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[A]], i64 1 -; UNROLL-NO-VF-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; UNROLL-NO-VF-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-VF-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i64 [[A2]], 4 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[TMP3]] +; UNROLL-NO-VF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 8 +; UNROLL-NO-VF-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] @@ -312,10 +284,11 @@ ; ; SINK-AFTER-LABEL: @recurrence_1( ; SINK-AFTER-NEXT: entry: -; SINK-AFTER-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; SINK-AFTER-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; SINK-AFTER-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; SINK-AFTER-NEXT: br label [[FOR_PREHEADER:%.*]] ; SINK-AFTER: for.preheader: -; SINK-AFTER-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0 +; SINK-AFTER-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 0 ; SINK-AFTER-NEXT: [[PRE_LOAD:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT]], align 4 ; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1 ; SINK-AFTER-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 @@ -323,20 +296,10 @@ ; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 ; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; SINK-AFTER: vector.memcheck: -; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 -; SINK-AFTER-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 -; SINK-AFTER-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 -; SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] -; SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP3:%.*]] = 
getelementptr i32, i32* [[A]], i64 1 -; SINK-AFTER-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8* -; SINK-AFTER-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP4]], 2 -; SINK-AFTER-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP6]] -; SINK-AFTER-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; SINK-AFTER-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; SINK-AFTER-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i64 [[A2]], 4 +; SINK-AFTER-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[TMP3]] +; SINK-AFTER-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 +; SINK-AFTER-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] @@ -3078,26 +3041,19 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[C]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; CHECK-NEXT: [[BOUND010:%.*]] = icmp ugt i32* [[TMP0]], [[C]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[SCEVGEP]] to i16* -; CHECK-NEXT: [[BOUND111:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP1]] -; CHECK-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; CHECK-NEXT: [[BOUND013:%.*]] = icmp ugt i32* [[TMP2]], [[B]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[SCEVGEP4]] to i16* -; CHECK-NEXT: [[BOUND114:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP3]] -; CHECK-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; CHECK-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[C2:%.*]] = ptrtoint i32* [[C:%.*]] to i64 +; CHECK-NEXT: [[A3:%.*]] = ptrtoint [2 x i16]* [[A]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[C2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[A3]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C2]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[TMP1]] +; CHECK-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP3]], 16 +; CHECK-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX6]], label [[SCALAR_PH]], label 
[[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3 @@ -3165,26 +3121,19 @@ ; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL: vector.memcheck: -; UNROLL-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[N]] -; UNROLL-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[N]] -; UNROLL-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; UNROLL-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[C]] -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] -; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NEXT: [[TMP0:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; UNROLL-NEXT: [[BOUND010:%.*]] = icmp ugt i32* [[TMP0]], [[C]] -; UNROLL-NEXT: [[TMP1:%.*]] = bitcast i32* [[SCEVGEP]] to i16* -; UNROLL-NEXT: [[BOUND111:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP1]] -; UNROLL-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; UNROLL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; UNROLL-NEXT: [[TMP2:%.*]] = bitcast i16* [[SCEVGEP8]] to i32* -; UNROLL-NEXT: [[BOUND013:%.*]] = icmp ugt i32* [[TMP2]], [[B]] -; UNROLL-NEXT: [[TMP3:%.*]] = bitcast i32* [[SCEVGEP4]] to i16* -; UNROLL-NEXT: [[BOUND114:%.*]] = icmp ult i16* [[SCEVGEP6]], [[TMP3]] -; UNROLL-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; UNROLL-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; UNROLL-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; UNROLL-NEXT: [[C2:%.*]] = ptrtoint i32* [[C:%.*]] to i64 +; UNROLL-NEXT: [[A3:%.*]] = ptrtoint [2 x i16]* [[A]] to i64 +; UNROLL-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[C2]] +; UNROLL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; UNROLL-NEXT: [[TMP1:%.*]] = add nuw i64 [[A3]], 2 +; UNROLL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C2]] +; UNROLL-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 32 +; UNROLL-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; UNROLL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[TMP1]] +; UNROLL-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP3]], 32 +; UNROLL-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; UNROLL-NEXT: br i1 [[CONFLICT_RDX6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3 @@ -3273,33 +3222,24 @@ ; ; UNROLL-NO-IC-LABEL: @PR34711( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8* -; UNROLL-NO-IC-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* +; UNROLL-NO-IC-NEXT: [[A3:%.*]] = ptrtoint [2 x i16]* [[A:%.*]] to i64 +; UNROLL-NO-IC-NEXT: [[C2:%.*]] = ptrtoint i32* [[C:%.*]] to i64 +; UNROLL-NO-IC-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; UNROLL-NO-IC-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0 ; UNROLL-NO-IC-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2 ; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 ; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label
[[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-IC: vector.memcheck: -; UNROLL-NO-IC-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; UNROLL-NO-IC-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; UNROLL-NO-IC-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8* -; UNROLL-NO-IC-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; UNROLL-NO-IC-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8* -; UNROLL-NO-IC-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]] -; UNROLL-NO-IC-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] -; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-IC-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]] -; UNROLL-NO-IC-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] -; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; UNROLL-NO-IC-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]] -; UNROLL-NO-IC-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]] -; UNROLL-NO-IC-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; UNROLL-NO-IC-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[C2]] +; UNROLL-NO-IC-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add nuw i64 [[A3]], 2 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C2]] +; UNROLL-NO-IC-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 32 +; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[TMP1]] +; UNROLL-NO-IC-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP3]], 32 +; UNROLL-NO-IC-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; UNROLL-NO-IC-NEXT: br i1 [[CONFLICT_RDX6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -3396,33 +3336,24 @@ ; ; UNROLL-NO-VF-LABEL: @PR34711( ; UNROLL-NO-VF-NEXT: entry: -; UNROLL-NO-VF-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8* -; UNROLL-NO-VF-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* +; UNROLL-NO-VF-NEXT: [[A3:%.*]] = ptrtoint [2 x i16]* [[A:%.*]] to i64 +; UNROLL-NO-VF-NEXT: [[C2:%.*]] = ptrtoint i32* [[C:%.*]] to i64 +; UNROLL-NO-VF-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; UNROLL-NO-VF-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0 ; UNROLL-NO-VF-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2 ; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2 ; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; UNROLL-NO-VF: vector.memcheck: -; UNROLL-NO-VF-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to 
i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; UNROLL-NO-VF-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; UNROLL-NO-VF-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8* -; UNROLL-NO-VF-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; UNROLL-NO-VF-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8* -; UNROLL-NO-VF-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; UNROLL-NO-VF-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]] -; UNROLL-NO-VF-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; UNROLL-NO-VF-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]] -; UNROLL-NO-VF-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]] -; UNROLL-NO-VF-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; UNROLL-NO-VF-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[C2]] +; UNROLL-NO-VF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add nuw i64 [[A3]], 2 +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C2]] +; UNROLL-NO-VF-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 8 +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[TMP1]] +; UNROLL-NO-VF-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP3]], 8 +; UNROLL-NO-VF-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; UNROLL-NO-VF-NEXT: br i1 [[CONFLICT_RDX6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -3480,33 +3411,24 @@ ; ; SINK-AFTER-LABEL: @PR34711( ; SINK-AFTER-NEXT: entry: -; SINK-AFTER-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8* -; SINK-AFTER-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8* +; SINK-AFTER-NEXT: [[A3:%.*]] = ptrtoint [2 x i16]* [[A:%.*]] to i64 +; SINK-AFTER-NEXT: [[C2:%.*]] = ptrtoint i32* [[C:%.*]] to i64 +; SINK-AFTER-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; SINK-AFTER-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0 ; SINK-AFTER-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2 ; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; SINK-AFTER: vector.memcheck: -; SINK-AFTER-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]] -; SINK-AFTER-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; SINK-AFTER-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1 -; 
SINK-AFTER-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8* -; SINK-AFTER-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; SINK-AFTER-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8* -; SINK-AFTER-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]] -; SINK-AFTER-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; SINK-AFTER-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]] -; SINK-AFTER-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] -; SINK-AFTER-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] -; SINK-AFTER-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]] -; SINK-AFTER-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]] -; SINK-AFTER-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] -; SINK-AFTER-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]] -; SINK-AFTER-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[C2]] +; SINK-AFTER-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; SINK-AFTER-NEXT: [[TMP1:%.*]] = add nuw i64 [[A3]], 2 +; SINK-AFTER-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C2]] +; SINK-AFTER-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP2]], 16 +; SINK-AFTER-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; SINK-AFTER-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[TMP1]] +; SINK-AFTER-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP3]], 16 +; SINK-AFTER-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; SINK-AFTER-NEXT: br i1 [[CONFLICT_RDX6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] diff --git a/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll b/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll --- a/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-exits-versioning.ll @@ -4,13 +4,13 @@ ; multiple exiting branches. ; Multiple branches exiting the loop to a unique exit block. The loop should -; be vectorized with versioning & noalias metadata should be added. +; be vectorized with versioning. 
define void @multiple_exits_unique_exit_block(i32* %A, i32* %B, i64 %N) { ; CHECK-LABEL: @multiple_exits_unique_exit_block ; CHECK: vector.memcheck: ; CHECK-LABEL: vector.body: -; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4, !alias.scope -; CHECK: store <2 x i32> %wide.load, <2 x i32>* {{.*}}, align 4, !alias.scope +; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4 +; CHECK: store <2 x i32> %wide.load, <2 x i32>* {{.*}}, align 4 ; CHECK: br ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -338,8 +338,8 @@ ; CHECK-LABEL: vector.body: ; CHECK: %wide.load = load <2 x i32>, <2 x i32>* -; CHECK: %wide.load16 = load <2 x i32>, <2 x i32>* -; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load16 +; CHECK: %wide.load5 = load <2 x i32>, <2 x i32>* +; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load5 ; CHECK: store <2 x i32> ; CHECK-LABEL: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll --- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -4,18 +4,18 @@ define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) { ; CHECK-LABEL: @add_ints( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK-LABEL: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 200 -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 200 -; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 200 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND09:%.*]] = icmp ugt i32* [[SCEVGEP7]], [[A]] -; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]] -; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[B2:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[C3:%.*]] = ptrtoint i32* [[C:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[C3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label %vector.body ; CHECK: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll --- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll @@ -13,8 +13,8 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) { ; CHECK-LABEL: @load_clamped_index( ; 
CHECK-NEXT: entry: -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: @@ -22,17 +22,9 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3 ; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -44,12 +36,12 @@ ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -96,8 +88,8 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) { ; CHECK-LABEL: @store_clamped_index( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: @@ -105,17 +97,9 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3 ; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: 
vector.memcheck: -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[A1]], [[B2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -127,12 +111,12 @@ ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !11 +; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -13,7 +13,9 @@ define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0, [[DBG4:!dbg !.*]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoint float* [[B:%.*]] to i64, [[DBG4:!dbg !.*]] +; CHECK-NEXT: [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64, [[DBG4]] +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0, [[DBG4]] ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]], [[DBG4]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1, [[DBG9:!dbg !.*]] @@ -22,15 +24,9 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3, [[DBG9]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]], [[DBG9]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1, [[DBG9]] -; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64, [[DBG9]] -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1, [[DBG9]] -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr 
float, float* [[A:%.*]], i64 [[TMP5]], [[DBG9]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP5]], [[DBG9]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[A]], [[DBG9]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]], [[DBG9]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]], [[DBG9]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[A1]], [[B2]], [[DBG9]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16, [[DBG9]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588, [[DBG9]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], [[DBG9]] @@ -38,11 +34,11 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [[DBG9]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]], [[DBG9]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*, [[DBG9]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, [[DBG9]], !alias.scope !10 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, [[DBG9]] ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[WIDE_LOAD]], , [[DBG9]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]], [[DBG9]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*, [[DBG9]] -; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4, [[DBG9]], !alias.scope !13, !noalias !10 +; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4, [[DBG9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, [[DBG9]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], [[DBG9]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[DBG9]], [[LOOP15:!llvm.loop !.*]] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll @@ -6,19 +6,14 @@ define void @same_step_and_size(i32* %a, i32* %b, i64 %n) { ; CHECK-LABEL: @same_step_and_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult 
i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph ; entry: br label %loop @@ -41,19 +36,14 @@ define void @same_step_and_size_no_dominance_between_accesses(i32* %a, i32* %b, i64 %n, i64 %x) { ; CHECK-LABEL: @same_step_and_size_no_dominance_between_accesses( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph ; entry: br label %loop @@ -121,20 +111,15 @@ define void @steps_match_but_different_access_sizes_1([2 x i16]* %a, i32* %b, i64 %n) { ; CHECK-LABEL: @steps_match_but_different_access_sizes_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A2:%.*]] = ptrtoint [2 x i16]* [[A:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint i32* [[B:%.*]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast i16* [[SCEVGEP3]] to i8* -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i16* [[SCEVGEP5]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP56]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[A2]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[B1]], [[TMP0]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph ; entry: br label %loop @@ -160,20 +145,15 @@ define void @steps_match_but_different_access_sizes_2([2 x i16]* %a, i32* %b, i64 %n) { ; CHECK-LABEL: @steps_match_but_different_access_sizes_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B4:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = ptrtoint i32* [[B:%.*]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint [2 x i16]* [[A:%.*]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, 
label %vector.memcheck ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i16* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0 -; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i16* [[SCEVGEP2]] to i8* -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B4]], [[SCEVGEP23]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[A1]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[B2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll --- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll @@ -17,16 +17,16 @@ ; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index ; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to * -; CHECKUF1: %wide.load = load , * %[[IDXB_CAST]], align 8, !alias.scope !0 +; CHECKUF1: %wide.load = load , * %[[IDXB_CAST]], align 8 ; CHECKUF1: %[[FADD:.*]] = fadd %wide.load, shufflevector ( insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer) ; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index ; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to * -; CHECKUF1: store %[[FADD]], * %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0 +; CHECKUF1: store %[[FADD]], * %[[IDXA_CAST]], align 8 ; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() ; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2 ; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]] ; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec -; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5 +; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0 ; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2). 
@@ -48,29 +48,29 @@
 ; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
 ; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8
 ; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
 ; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
 ; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
 ; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
 ; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8
 ; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
 ; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8
 ; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
 ; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
 ; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
 ; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
 ; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8
 ; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
 ; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
 ; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
 ; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
 
 define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
--- a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
+++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -17,8 +17,7 @@
 ; CHECK-NOTBAA-LABEL: @test1
 ; CHECK-NOTBAA: entry:
-; CHECK-NOTBAA: icmp ugt i32*
-; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA: icmp ult i64
 ; CHECK-NOTBAA-NOT: icmp
 ; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
@@ -50,9 +49,8 @@
 define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) {
 ; CHECK-LABEL: @test2
 ; CHECK: entry:
-; CHECK: icmp ugt float*
-; CHECK: icmp ugt float*
-; CHECK-NOT: icmp uge i32*
+; CHECK: icmp ult i64
+; CHECK-NOT: icmp
 ; CHECK: br i1 {{.+}}, label %for.body, label %vector.body
 
 ; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
@@ -62,10 +60,8 @@
 ; CHECK-NOTBAA-LABEL: @test2
 ; CHECK-NOTBAA: entry:
-; CHECK-NOTBAA: icmp ugt float*
-; CHECK-NOTBAA: icmp ugt float*
-; CHECK-NOTBAA-DAG: icmp ugt float*
-; CHECK-NOTBAA-DAG: icmp ugt i32*
+; CHECK-NOTBAA: icmp ult i64
+; CHECK-NOTBAA: icmp ult i64
 ; CHECK-NOTBAA-NOT: icmp
 ; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
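The test updates above and below all follow the same pattern: instead of expanding SCEV end pointers and testing two ranges for overlap, the vectorizer now compares the pointer difference against AccessSize * VF * IC. A minimal sketch of the emitted check, with illustrative names and a hand-picked bound of 32 bytes (8-byte doubles, VF = 2, IC = 2, matching @loop below), might look like this; it is not taken from the patch itself:

; Sketch only; the function and value names and the constant bound are illustrative.
define i1 @diff_check_sketch(double* %src, double* %sink) {
entry:
  %src.int = ptrtoint double* %src to i64
  %sink.int = ptrtoint double* %sink to i64
  ; Distance from the earlier (src) access to the later (sink) access.
  %diff = sub i64 %sink.int, %src.int
  ; Unsigned compare: a "negative" distance (sink before src) wraps to a large
  ; value, so it does not trigger the bail-out and the vector path is taken.
  %too.close = icmp ult i64 %diff, 32
  ; true would branch to the scalar loop, false to the vector body (not shown).
  ret i1 %too.close
}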
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -41,21 +41,20 @@
 define void @loop(double* %X, double* %Y) {
 ; CHECK-LABEL: @loop(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 20000
-; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr double, double* [[Y:%.*]], i64 20000
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP9]], [[X]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[X6:%.*]] = ptrtoint double* [[X:%.*]] to i64
+; CHECK-NEXT: [[Y7:%.*]] = ptrtoint double* [[Y:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X6]], [[Y7]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, <2 x double>* [[TMP4]]
 ; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD11]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]],
@@ -66,10 +65,10 @@
 ; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP6]], <2 x double> zeroinitializer, <2 x double> [[TMP10]]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 2
 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP16]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000
 ; CHECK-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -161,19 +160,19 @@
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !3
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]],
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !11
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !6
 ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD14]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !13, !noalias !15
+; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !8, !noalias !10
 ; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP4]], <4 x float> , <4 x float> [[WIDE_LOAD15]]
 ; CHECK-NEXT: [[PREDPHI:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP11]], align 4, !alias.scope !13, !noalias !15
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP11]], align 4, !alias.scope !8, !noalias !10
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
 ; CHECK-NEXT: br i1 [[TMP12]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
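In the vdiv.ll update that follows, the same difference check is additionally folded into the existing minimum-iteration guard: the two bail-out conditions are combined with a select acting as a short-circuiting or, and the bound grows to 128 bytes (8-byte doubles, VF = 4, interleave count 4). A hedged sketch of that combined guard, with made-up names, is:

; Sketch only; the 16-iteration threshold and the 128-byte bound mirror the
; vdiv.ll expectations, but the function and value names are illustrative.
define i1 @combined_guard_sketch(i32 %n, double* %x, double* %y) {
entry:
  ; Too few iterations to be worth vectorizing?
  %min.iters.check = icmp ult i32 %n, 16
  %x.int = ptrtoint double* %x to i64
  %y.int = ptrtoint double* %y to i64
  %diff = sub i64 %x.int, %y.int
  ; Store and load ranges closer than VF * IC * AccessSize bytes?
  %diff.check = icmp ult i64 %diff, 128
  ; The select computes %min.iters.check | %diff.check without a second branch.
  %or.cond = select i1 %min.iters.check, i1 true, i1 %diff.check
  ; true would branch to the scalar preheader, false to vector.ph (not shown).
  ret i1 %or.cond
}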
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -17,16 +17,14 @@
 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK: for.body.preheader:
+; CHECK-NEXT: [[X4:%.*]] = ptrtoint double* [[X:%.*]] to i64
+; CHECK-NEXT: [[Y5:%.*]] = ptrtoint double* [[Y:%.*]] to i64
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER18:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP7]], [[X]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER18]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]]
+; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
@@ -46,40 +44,40 @@
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]], !alias.scope !7
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 4
 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !tbaa [[TBAA3]], !alias.scope !7
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 8
 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa [[TBAA3]], !alias.scope !7
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 12
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x double>, <4 x double>* [[TMP11]], align 8, !tbaa [[TBAA3]], !alias.scope !7
+; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x double>, <4 x double>* [[TMP11]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP0]]
 ; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <4 x double> [[WIDE_LOAD9]], [[TMP1]]
 ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[WIDE_LOAD10]], [[TMP2]]
 ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x double> [[WIDE_LOAD11]], [[TMP3]]
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP12]], <4 x double>* [[TMP17]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
+; CHECK-NEXT: store <4 x double> [[TMP12]], <4 x double>* [[TMP17]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP16]], i64 4
 ; CHECK-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP13]], <4 x double>* [[TMP19]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
+; CHECK-NEXT: store <4 x double> [[TMP13]], <4 x double>* [[TMP19]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP16]], i64 8
 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP14]], <4 x double>* [[TMP21]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
+; CHECK-NEXT: store <4 x double> [[TMP14]], <4 x double>* [[TMP21]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP16]], i64 12
 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP23]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
+; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP23]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER18]]
-; CHECK: for.body.preheader18:
-; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15]]
+; CHECK: for.body.preheader15:
+; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[TMP25:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
 ; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP25]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
@@ -101,10 +99,10 @@
 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
 ; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: for.body.prol.loopexit:
-; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER18]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
+; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER15]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
 ; CHECK-NEXT: [[TMP29:%.*]] = icmp ult i64 [[TMP26]], 3
 ; CHECK-NEXT: br i1 [[TMP29]], label [[FOR_END]], label [[FOR_BODY_PREHEADER18_NEW:%.*]]
-; CHECK: for.body.preheader18.new:
+; CHECK: for.body.preheader15.new:
 ; CHECK-NEXT: [[TMP30:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT: [[TMP31:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT: [[TMP32:%.*]] = fdiv fast double 1.000000e+00, [[A]]